#The “Auto MPG” dataset is derived from the U.S. Environmental Protection Agency (EPA), focusing on fuel efficiency data from 1970 to 1982. Here’s an explanation of the collection process:
#Source: The EPA collected the data to evaluate fuel economy during the oil crisis era. #Scope: It covers vehicles from multiple origins (USA, Europe, and Japan). #Variables: There are nine attributes: miles per gallon (MPG), cylinders, displacement, horsepower, weight, acceleration, model year, origin, and car name. #Purpose: To assess the fuel efficiency of cars, addressing environmental and energy concerns
#The Auto MPG dataset contains information about various car models and their fuel efficiency. It includes attributes such as cylinders, displacement, horsepower, weight, acceleration, model year, and origin. #1.mpg: Miles per gallon, the target variable representing fuel efficiency. #2.cylinders: Number of cylinders in the engine (typically 4, 6, or 8). #3.displacement: Engine displacement in cubic inches. #4.horsepower: Engine horsepower. #5.weight: Vehicle weight in pounds. #6.acceleration: Time taken for the vehicle to accelerate from 0 to 60 miles per hour (mph). #7.model year: Year of manufacturing (e.g., 70 for 1970, 71 for 1971, etc.). #8.origin: Origin of the car (1 for American, 2 for European, 3 for Japanese). #9.car name: Name of the car model.
#Considering the independent variables cylinders, displacement, horsepower, weight, and acceleration, my aim is to analyze the variation in the dependent variable mpg.
# Load the Auto MPG dataset once. The original read the same CSV three
# times (into `data`, `autompg`, and again before cleaning); one read is
# enough, and both names are kept so later chunks still work.
file_path <- "/Users/user/Downloads/auto-mpg.csv"
autompg <- read.csv(file_path)
data <- autompg  # retained for backward compatibility with `data` users
str(autompg)
## 'data.frame': 398 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : chr "130" "165" "150" "150" ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ model.year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ car.name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
# Simple data cleaning: horsepower is read as character because some rows
# hold a non-numeric missing-value token (presumably "?" -- TODO confirm
# in the raw CSV). as.integer() coerces those tokens to NA (hence the
# warning below) and na.omit() then drops the 6 affected rows (398 -> 392).
autompg$horsepower <- as.integer(autompg$horsepower)
## Warning: NAs introduced by coercion
auto <- na.omit(autompg)
str(auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ model.year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ car.name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "na.action")= 'omit' Named int [1:6] 33 127 331 337 355 375
## ..- attr(*, "names")= chr [1:6] "33" "127" "331" "337" ...
# Summary statistics: min / quartiles / median / mean / max per column
summary(auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
## acceleration model.year origin car.name
## Min. : 8.00 Min. :70.00 Min. :1.000 Length:392
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 Class :character
## Median :15.50 Median :76.00 Median :1.000 Mode :character
## Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :24.80 Max. :82.00 Max. :3.000
# The summary statistics help us understand the spread and central
# tendency of each variable.
# Check for missing values (all zero, since NA rows were dropped above)
colSums(is.na(auto))
## mpg cylinders displacement horsepower weight acceleration
## 0 0 0 0 0 0
## model.year origin car.name
## 0 0 0
# Distribution of MPG
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
ggplot(auto, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "steelblue", color = "black") +
  labs(title = "Distribution of MPG")
# Insights from the histogram:
# 1. There are no extreme values (outliers) in the data.
# 2. mpg is approximately normally distributed.
# Density plot of MPG
ggplot(auto, aes(x = mpg)) +
  geom_density() +
  labs(title = "Density Plot of mpg")
# The density plot gives a smooth estimate of the probability density
# function: most mpg values fall in the 10-40 range and the distribution
# is roughly normal. (These interpretation lines were missing their '#'
# in the original and would not parse as an R script.)
# Analyzing the relationship between mpg and horsepower
ggplot(auto, aes(x = mpg, y = horsepower)) +
  geom_point(color = "pink") +
  labs(title = "mpg Vs horsepower",
       x = "mpg",
       y = "horsepower") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# With an increase in horsepower, mpg decreases with a linear trend.
# Analyzing the relationship between mpg and weight
ggplot(auto, aes(x = mpg, y = weight)) +
  geom_point(color = "pink") +
  labs(title = "mpg Vs vehicle weight",
       x = "mpg",
       y = "vehicle weight") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# With an increase in vehicle weight, mpg decreases with a linear trend.
# Analyzing the relationship between mpg and acceleration
ggplot(auto, aes(x = mpg, y = acceleration)) +
  geom_point(color = "pink") +
  labs(title = "mpg Vs acceleration",
       x = "mpg",
       y = "acceleration") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# mpg is directly proportional to acceleration: as acceleration increases,
# mpg also increases. (This comment was missing its '#' in the original
# and would not parse.)
# Visualization of the relationship between mpg and displacement.
# NOTE: switched from the uncleaned `autompg` to the cleaned `auto`
# dataset for consistency with the other scatterplots, and removed the
# redundant library(ggplot2) calls (the package is already attached).
ggplot(auto, aes(x = mpg, y = displacement)) +
  geom_point(color = "pink") +
  labs(title = "mpg versus displacement",
       x = "miles per gallon(mpg)",
       y = "displacement") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# With an increase in displacement, mpg decreases.
# Visualization of the relationship between weight and displacement
ggplot(auto, aes(x = weight, y = displacement)) +
  geom_point(color = "pink") +
  labs(title = "weight versus displacement",
       x = "weight",
       y = "displacement") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# Weight is directly proportional to displacement: as weight increases,
# displacement also increases.
# Boxplots of the numeric variables. Columns are selected by name and
# titled so each plot is self-describing (the original used bare column
# indices and produced unlabeled plots).
boxplot(auto[, "mpg"], main = "mpg")
boxplot(auto[, "displacement"], main = "displacement")
boxplot(auto[, "horsepower"], main = "horsepower")
boxplot(auto[, "weight"], main = "weight")
boxplot(auto[, "acceleration"], main = "acceleration")
# A box plot shows the median, quartiles, whisker range and potential
# outliers of each variable (it displays the median, not the mean).
stars(auto)
# NOTE(review): stars() receives the full data frame including the
# character column car.name -- confirm it renders as intended; passing
# auto[, 1:6] may be safer.
# With the star plot we can immediately identify vehicles with similar
# attribute profiles. (This comment was missing its '#' in the original
# and would not parse.)
pairs(auto[, 1:6])
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggscatmat(auto, columns = 1:6)
# How are the variables related to each other?
# Pairwise correlations across the six numeric columns.
numeric_cols <- auto[, 1:6]
cor_matrix <- cor(numeric_cols)
# Heatmap rendering of the correlation matrix: pale cells indicate weak
# correlation among the variables, dark cells indicate strong correlation.
heatmap(cor_matrix, main = "Heatmap of Correlation Matrix")
# Univariate analysis of the mpg (miles per gallon) variable
mpg <- auto$mpg
# Mean: the average fuel efficiency of the vehicles in the dataset
mpg_mean <- mean(mpg)
mpg_mean
## [1] 23.44592
# Variance: quantifies the spread of mpg values around the mean
mpg_variance <- var(mpg)
mpg_variance
## [1] 60.91814
# Visualization of the mpg distribution
hist(mpg, main = "Distribution of MPG", xlab = "Miles Per Gallon (MPG)", col = "skyblue", border = "black")
# Q-Q plot of mpg.
# NOTE: the original plotted `autompg$mpg` (the uncleaned, 398-row data);
# switched to the cleaned `auto` used everywhere else for consistency.
qqnorm(auto$mpg, main = "QQ Plot of mpg")
qqline(auto$mpg)
# The mean gives an insight into the average fuel efficiency of the
# vehicles; the variance quantifies the spread of mpg around that mean --
# a higher variance indicates greater variability in fuel efficiency.
# The histogram shows the shape of the fuel-efficiency distribution and
# helps flag potential outliers.
# The Q-Q plot indicates univariate normality: most points fall on the
# 45-degree reference line, suggesting mpg is approximately normal.
# Together, the univariate mean and variance describe the typical fuel
# efficiency of vehicles in the dataset.
# Multivariate mean and variance analysis over the six numeric columns
y <- auto[, c(1:6)]
# Column means and the sample covariance matrix
auto_colmean <- colMeans(y)
auto_covariance <- cov(y)
# Squared Mahalanobis distance of every observation from the mean vector.
# stats::mahalanobis() computes t(y - mu) %*% solve(S) %*% (y - mu) for
# each row, inverting the covariance matrix once -- replacing the
# original per-row apply()/solve() loop with the same result.
auto_d <- mahalanobis(y, auto_colmean, auto_covariance)
# mean vector
auto_colmean
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
#covariance
auto_covariance
## mpg cylinders displacement horsepower weight
## mpg 60.918142 -10.352928 -657.5852 -233.85793 -5517.4407
## cylinders -10.352928 2.909696 169.7219 55.34824 1300.4244
## displacement -657.585207 169.721949 10950.3676 3614.03374 82929.1001
## horsepower -233.857926 55.348244 3614.0337 1481.56939 28265.6202
## weight -5517.440704 1300.424363 82929.1001 28265.62023 721484.7090
## acceleration 9.115514 -2.375052 -156.9944 -73.18697 -976.8153
## acceleration
## mpg 9.115514
## cylinders -2.375052
## displacement -156.994435
## horsepower -73.186967
## weight -976.815253
## acceleration 7.611331
#Mahalanobis distances for each observation to the vector auto_d.
auto_d
## 1 2 3 4 5 6 7
## 6.0466033 5.4855309 6.2332533 6.5863928 7.4165260 12.6172373 21.8602741
## 8 9 10 11 12 13 14
## 17.7342966 24.2146795 10.6027127 11.6370855 9.5986696 14.8428922 72.7789543
## 15 16 17 18 19 20 21
## 2.5408270 1.7795195 4.0672082 5.3227595 2.5146385 9.1129679 2.3788192
## 22 23 24 25 26 27 28
## 2.9463436 4.0738180 5.6524721 4.1437857 20.3027094 23.7526454 23.5463035
## 29 30 31 32 34 35 36
## 32.5179885 2.5146385 2.5782475 3.0127631 7.7476428 2.8712219 4.1648545
## 37 38 39 40 41 42 43
## 6.9882010 2.3956907 2.7549581 5.9422008 2.3076187 3.3239950 9.3833584
## 44 45 46 47 48 49 50
## 7.4245271 13.1860908 5.8169441 6.0775781 3.7168751 8.7768321 4.1837420
## 51 52 53 54 55 56 57
## 2.4697694 4.2791798 2.4702876 5.6767249 7.2344989 6.0457929 9.0671181
## 58 59 60 61 62 63 64
## 2.9567746 2.8332854 13.2384975 10.1802011 5.1108680 2.9144611 5.7883029
## 65 66 67 68 69 70 71
## 3.2799918 2.3875246 4.9755627 12.2210451 3.8897386 3.2520049 7.6113823
## 72 73 74 75 76 77 78
## 12.7657703 4.2655596 5.6645884 6.1397810 3.0535501 8.6663568 3.1374016
## 79 80 81 82 83 84 85
## 6.2730766 1.9947506 2.9451928 3.0844359 3.0151389 1.6542663 3.0495152
## 86 87 88 89 90 91 92
## 4.5297676 5.4425689 3.3539556 4.3589891 3.4135037 11.4393871 9.3252498
## 93 94 95 96 97 98 99
## 2.8726999 3.7059977 15.9671886 20.3734838 6.0234134 1.8157042 5.4654934
## 100 101 102 103 104 105 106
## 3.7567292 7.6010098 1.2781284 8.2755578 12.5435125 9.0465337 4.9806190
## 107 108 109 110 111 112 113
## 5.1279021 5.3911528 9.2681043 7.6320933 3.9666294 13.5546650 8.2829361
## 114 115 116 117 118 119 120
## 8.4388597 2.2445531 2.9937487 22.1362714 4.4685795 3.2976521 6.0602905
## 121 122 123 124 125 126 128
## 7.2721939 7.2454455 4.4996009 9.1136485 9.7909390 1.7407838 3.6399810
## 129 130 131 132 133 134 135
## 5.1083176 3.8717657 1.1599144 9.7142697 2.7163191 6.0230236 4.2046286
## 136 137 138 139 140 141 142
## 2.8277289 5.0784614 6.2523643 6.3714823 10.9938141 5.9050895 1.3717961
## 143 144 145 146 147 148 149
## 3.7397301 3.0116555 4.5493703 2.7738522 2.6864176 3.4863252 3.8572672
## 150 151 152 153 154 155 156
## 2.3668529 1.8708240 1.9642297 2.4024806 2.5897327 14.5379282 14.6682092
## 157 158 159 160 161 162 163
## 8.4687762 4.3470671 6.8226963 7.0742376 9.4103952 6.0604355 5.9205037
## 164 165 166 167 168 169 170
## 6.1078241 0.8542144 10.6597661 11.9205059 1.0696669 2.7230575 3.0216100
## 171 172 173 174 175 176 177
## 3.6603986 4.5864682 2.2939626 2.9419187 5.9296637 3.7765016 3.4419507
## 178 179 180 181 182 183 184
## 3.6262134 4.6096947 6.6733153 5.5298118 3.4681733 1.5586787 2.2534254
## 185 186 187 188 189 190 191
## 2.3169235 2.0311813 1.5573265 6.9763367 4.0587509 6.7591541 2.7203276
## 192 193 194 195 196 197 198
## 1.2101570 2.8634552 2.7788899 3.0622173 8.2348904 9.7398240 3.4815705
## 199 200 201 202 203 204 205
## 3.4522778 3.4548162 11.0221788 2.5107250 6.8290917 7.3703182 1.8473117
## 206 207 208 209 210 211 212
## 1.1482065 8.3340515 7.8577447 3.2251029 13.7004243 7.1270088 14.9848512
## 213 214 215 216 217 218 219
## 5.7856327 4.1770997 4.7189831 4.5244841 2.5168285 1.6218193 4.3748227
## 220 221 222 223 224 225 226
## 2.3771857 2.3030895 4.2250759 13.1884871 3.3540124 7.1680915 2.1018279
## 227 228 229 230 231 232 233
## 1.1484585 3.2881854 4.8077016 7.1542955 3.4336085 8.6324589 3.2203691
## 234 235 236 237 238 239 240
## 2.6479072 3.0806880 2.0901740 2.5820051 1.4875672 2.5765401 1.6630043
## 241 242 243 244 245 246 247
## 2.7752529 7.6980164 6.2190793 14.2675551 14.0363164 4.9998920 3.1080286
## 248 249 250 251 252 253 254
## 7.3788166 3.8108934 9.3841631 3.6109858 4.9160849 3.6749249 2.2750284
## 255 256 257 258 259 260 261
## 3.5215395 2.7899699 1.6331334 3.2854507 1.0345339 2.8046714 3.5074604
## 262 263 264 265 266 267 268
## 1.5425528 6.2428526 9.4775427 8.7043291 3.6304859 1.0877698 2.5210617
## 269 270 271 272 273 274 275
## 2.0907946 2.4853044 3.7538431 4.2614673 3.6399891 2.5418167 4.4011201
## 276 277 278 279 280 281 282
## 9.6865239 6.4915588 13.4578010 2.3174668 1.0936535 0.1643588 3.5341659
## 283 284 285 286 287 288 289
## 3.5875228 3.3203424 0.6128022 3.8911146 4.5412015 3.9977686 3.5260916
## 290 291 292 293 294 295 296
## 3.8456361 3.0193188 7.5948773 4.0806958 3.5148747 3.0505088 4.1382115
## 297 298 299 300 301 302 303
## 3.7685671 9.1544181 9.8955237 17.9871997 21.3638855 6.8570559 3.4391228
## 304 305 306 307 308 309 310
## 3.2900173 5.7152645 2.8314413 7.8583939 5.2870299 7.6251703 9.7886019
## 311 312 313 314 315 316 317
## 5.3840354 1.8050832 3.8993109 2.7265317 3.2900184 6.7934643 3.5923418
## 318 319 320 321 322 323 324
## 2.5197479 3.1104028 2.0599602 6.6346415 2.0943973 16.4413445 4.3700811
## 325 326 327 328 329 330 332
## 8.9286687 15.9945221 20.3220019 12.6951839 13.1683172 13.8121637 2.1439904
## 333 334 335 336 338 339 340
## 3.2778466 15.3084655 11.8241009 4.9115795 1.4377330 1.6175342 2.7657884
## 341 342 343 344 345 346 347
## 3.8143292 4.9389936 5.2865052 5.5678246 5.2749629 3.5124596 1.7727543
## 348 349 350 351 352 353 354
## 6.0168573 4.1740763 2.2498095 5.0463574 2.3864797 3.9425656 3.4829273
## 356 357 358 359 360 361 362
## 3.4647993 1.5906841 5.2735988 2.9803976 9.5011647 12.6446882 6.4156855
## 363 364 365 366 367 368 369
## 10.6572712 0.8449586 18.3799151 2.4116509 6.4203338 4.8357267 3.1008496
## 370 371 372 373 374 376 377
## 4.7284353 2.4484209 1.7046545 3.4523095 3.0454206 3.6191359 4.5058594
## 378 379 380 381 382 383 384
## 2.0300983 7.0531257 3.1626516 4.6875593 4.7052032 2.1284542 5.2392466
## 385 386 387 388 389 390 391
## 1.9620319 4.4761121 3.3094538 20.1140633 3.6217100 2.3643049 5.5077094
## 392 393 394 395 396 397 398
## 8.2361691 4.1269839 3.3940012 26.1005519 8.6369022 2.0545078 4.6020091
#The above analysis shows a strong negative covariance between mpg and weight, indicating an inverse relationship: as vehicle weight increases, mpg tends to decrease. At the same time, mpg and displacement are also negatively related, meaning that as engine size (displacement) increases, mpg tends to decrease.
#Here multivariate mean and variance analysis helped to understand the relationship between different variables.
# Keep only the six numeric measurement columns by dropping model.year,
# origin and car.name (columns 7-9).
auto_mpg <- auto[, -(7:9)]
str(auto_mpg)
## 'data.frame': 392 obs. of 6 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
# Pairwise correlations between the remaining variables
cor(auto_mpg, use = "complete.obs")
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## acceleration
## mpg 0.4233285
## cylinders -0.5046834
## displacement -0.5438005
## horsepower -0.6891955
## weight -0.4168392
## acceleration 1.0000000
# Computing principal components on the standardized variables.
# auto_mpg is already NA-free (auto was na.omit()-ed during cleaning), so
# this na.omit() is only a defensive no-op.
auto_mpg <- na.omit(auto_mpg)
# prcomp's scaling argument is spelled `scale.`; the original wrote
# `scale=` and relied on partial argument matching.
auto_pca <- prcomp(auto_mpg, scale. = TRUE)
auto_pca
## Standard deviations (1, .., p=6):
## [1] 2.1882107 0.8535989 0.5083968 0.3538036 0.2513492 0.1904783
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## mpg 0.3989731 0.2448345 0.85211071 0.2208091 -0.071093900
## cylinders -0.4306152 -0.1483141 0.40032254 -0.5763105 0.285904005
## displacement -0.4435314 -0.1084971 0.29750498 -0.1107832 -0.005593684
## horsepower -0.4341217 0.1661584 0.02260028 0.6752369 0.558588045
## weight -0.4301031 -0.2860955 0.12470286 0.3409071 -0.726611860
## acceleration 0.2919257 -0.8926523 0.09528071 0.1862098 0.270532567
## PC6
## mpg 0.03119071
## cylinders 0.46737615
## displacement -0.83108055
## horsepower 0.12431178
## weight 0.27097508
## acceleration -0.03179437
# Variance explained per component: PC1 ~79.8%, PC1+PC2 ~92% cumulative
summary(auto_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.188 0.8536 0.50840 0.35380 0.25135 0.19048
## Proportion of Variance 0.798 0.1214 0.04308 0.02086 0.01053 0.00605
## Cumulative Proportion 0.798 0.9195 0.96256 0.98342 0.99395 1.00000
#From the summary of auto_pca it is evident that PC1 and PC2 play the most important role, as together they capture about 92% of the variance; the remaining components contribute little and can be discarded. Given the loadings (mpg positive on PC1; cylinders, displacement, horsepower and weight negative), vehicles with higher PC1 scores tend to have higher mpg, fewer cylinders, smaller engine displacement, lower horsepower, and lower weight.
# Eigenvalues: the squared standard deviations of the principal components
eigen_auto <- auto_pca$sdev^2
names(eigen_auto) <- paste0("PC", 1:6)
eigen_auto
## PC1 PC2 PC3 PC4 PC5 PC6
## 4.7882662 0.7286311 0.2584673 0.1251770 0.0631764 0.0362820
# Total variance (equals the number of variables because of scaling)
sumlambdas <- sum(eigen_auto)
sumlambdas
## [1] 6
# Proportion of variance explained by each component
propvar <- eigen_auto / sumlambdas
propvar
## PC1 PC2 PC3 PC4 PC5 PC6
## 0.79804436 0.12143852 0.04307789 0.02086284 0.01052940 0.00604700
# Cumulative proportion of variance
cumvar_auto <- cumsum(propvar)
cumvar_auto
## PC1 PC2 PC3 PC4 PC5 PC6
## 0.7980444 0.9194829 0.9625608 0.9834236 0.9939530 1.0000000
# Tabulate eigenvalues, proportions and cumulative proportions together
matlambdas <- rbind(eigen_auto, propvar, cumvar_auto)
rownames(matlambdas) <- c("Eigenvalues", "Prop. variance", "Cum. prop. variance")
round(matlambdas, 4)
## PC1 PC2 PC3 PC4 PC5 PC6
## Eigenvalues 4.7883 0.7286 0.2585 0.1252 0.0632 0.0363
## Prop. variance 0.7980 0.1214 0.0431 0.0209 0.0105 0.0060
## Cum. prop. variance 0.7980 0.9195 0.9626 0.9834 0.9940 1.0000
# NOTE(review): the three calls below repeat output already shown above
# (summary(auto_pca) and the prcomp print); kept to preserve the original
# report's flow.
summary(auto_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.188 0.8536 0.50840 0.35380 0.25135 0.19048
## Proportion of Variance 0.798 0.1214 0.04308 0.02086 0.01053 0.00605
## Cumulative Proportion 0.798 0.9195 0.96256 0.98342 0.99395 1.00000
# Loadings (rotation matrix): contribution of each variable to each PC
auto_pca$rotation
## PC1 PC2 PC3 PC4 PC5
## mpg 0.3989731 0.2448345 0.85211071 0.2208091 -0.071093900
## cylinders -0.4306152 -0.1483141 0.40032254 -0.5763105 0.285904005
## displacement -0.4435314 -0.1084971 0.29750498 -0.1107832 -0.005593684
## horsepower -0.4341217 0.1661584 0.02260028 0.6752369 0.558588045
## weight -0.4301031 -0.2860955 0.12470286 0.3409071 -0.726611860
## acceleration 0.2919257 -0.8926523 0.09528071 0.1862098 0.270532567
## PC6
## mpg 0.03119071
## cylinders 0.46737615
## displacement -0.83108055
## horsepower 0.12431178
## weight 0.27097508
## acceleration -0.03179437
# print() shows the standard deviations and the rotation matrix again
print(auto_pca)
## Standard deviations (1, .., p=6):
## [1] 2.1882107 0.8535989 0.5083968 0.3538036 0.2513492 0.1904783
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## mpg 0.3989731 0.2448345 0.85211071 0.2208091 -0.071093900
## cylinders -0.4306152 -0.1483141 0.40032254 -0.5763105 0.285904005
## displacement -0.4435314 -0.1084971 0.29750498 -0.1107832 -0.005593684
## horsepower -0.4341217 0.1661584 0.02260028 0.6752369 0.558588045
## weight -0.4301031 -0.2860955 0.12470286 0.3409071 -0.726611860
## acceleration 0.2919257 -0.8926523 0.09528071 0.1862098 0.270532567
## PC6
## mpg 0.03119071
## cylinders 0.46737615
## displacement -0.83108055
## horsepower 0.12431178
## weight 0.27097508
## acceleration -0.03179437
# Screeplot of the component variances
screeplot(auto_pca, type = "line")
# Scatter of the first two principal-component scores (`x` from prcomp)
plot(auto_pca$x[, 1], auto_pca$x[, 2])
# Variance explained by each component. Renamed from the original
# `pc1.var`, which was misleading: the vector holds the variances of all
# six components, not just PC1.
pc.var <- auto_pca$sdev^2
pc.var
## [1] 4.7882662 0.7286311 0.2584673 0.1251770 0.0631764 0.0362820
# Percentage of total variation carried by each component
pc.var.per <- round(pc.var / sum(pc.var) * 100, 1)
pc.var.per
## [1] 79.8 12.1 4.3 2.1 1.1 0.6
# Bar-plot visualization of those percentages
barplot(pc.var.per, main = "Scree Plot", xlab = "Principal Component", ylab = "Percent Variation")
# PC1 accounts for ~80% of the variation and PC2 for another ~12%, so the
# first two components summarize most of the structure in the data.
# (The original trailing comment was missing its '#' and would not parse.)
# Visualization
# Assemble the first two principal-component scores into a data frame,
# keyed by the observation (row) names, ready for plotting.
pca.data <- data.frame(
  Sample = rownames(auto_pca$x),
  X = auto_pca$x[, 1],
  Y = auto_pca$x[, 2]
)
pca.data
## Sample X Y
## 1 1 -2.323001697 0.571351964
## 2 2 -3.201964571 0.681869756
## 3 3 -2.666575807 0.992744448
## 4 4 -2.602139465 0.621974687
## 5 5 -2.596582479 1.092197046
## 6 6 -4.395835723 1.009492806
## 7 7 -4.913438189 1.366352082
## 8 8 -4.829346859 1.535208991
## 9 9 -4.904206865 1.039427128
## 10 10 -4.050405289 1.666109789
## 11 11 -3.491120599 1.198362034
## 12 12 -3.482118800 1.800030624
## 13 13 -3.490771208 1.189488863
## 14 14 -4.226191456 1.490428610
## 15 15 1.101132642 0.568018151
## 16 16 -0.046786636 -0.073797827
## 17 17 -0.248177175 -0.171804324
## 18 18 0.183874920 -0.229329639
## 19 19 1.470882529 0.891785757
## 20 20 2.677719448 -1.162874147
## 21 21 1.367819933 -0.341977489
## 22 22 1.100680192 0.694898527
## 23 23 1.453411754 -0.201186781
## 24 24 0.771790665 1.555539559
## 25 25 -0.004980648 0.096303063
## 26 26 -4.266190360 -0.388945764
## 27 27 -3.645540172 -0.641804156
## 28 28 -3.915589384 -0.095356403
## 29 29 -3.414908929 -1.952640621
## 30 30 1.470882529 0.891785757
## 31 31 1.355150122 0.518513192
## 32 32 1.119352270 0.971446992
## 34 34 -0.564408793 0.694349376
## 35 35 -0.887569688 -0.450950193
## 36 36 -0.830322130 -0.430035632
## 37 37 -0.579073521 -0.410005216
## 38 38 -0.682151198 -0.366194337
## 39 39 -3.461456805 0.314922968
## 40 40 -3.968194056 0.382139786
## 41 41 -3.143783183 -0.204727162
## 42 42 -2.993615966 -0.002147945
## 43 43 -4.303390001 0.193233471
## 44 44 -4.052805767 0.072424823
## 45 45 -4.308703815 -0.038698142
## 46 46 -0.951690663 0.406935261
## 47 47 1.548889520 -0.928357233
## 48 48 -0.757194935 -0.189688344
## 49 49 -0.653468219 -0.062914614
## 50 50 1.084527990 0.863221202
## 51 51 1.369549715 1.076225335
## 52 52 2.460965239 -0.672074674
## 53 53 1.830636684 0.965315616
## 54 54 2.701890395 -0.390833504
## 55 55 2.832212120 0.128321520
## 56 56 2.412724102 -0.585396447
## 57 57 2.371703688 -1.093468524
## 58 58 1.201637299 0.437900242
## 59 59 1.723315705 -0.013552307
## 60 60 2.539415407 -2.334247663
## 61 61 1.296548609 -1.075171710
## 62 62 1.243788981 -0.010432298
## 63 63 -3.545487754 0.261660787
## 64 64 -3.875284792 0.246969583
## 65 65 -2.909339547 -0.145693959
## 66 66 -3.184031056 -0.034527717
## 67 67 -2.724948674 0.734622655
## 68 68 -4.755134084 0.505275882
## 69 69 -3.389432356 -0.343638888
## 70 70 -3.473649790 -0.337930122
## 71 71 -4.061407991 0.106111303
## 72 72 1.120233165 1.050821593
## 73 73 -2.832769206 0.274226618
## 74 74 -2.667739706 -0.432679238
## 75 75 -2.646951122 -1.097459658
## 76 76 -2.878181412 -0.319306281
## 77 77 0.231810647 0.417718786
## 78 78 1.426337943 -0.602524965
## 79 79 1.177139902 -1.228340767
## 80 80 1.978767339 -0.372890446
## 81 81 1.156424939 0.125793045
## 82 82 1.661415694 0.078309565
## 83 83 0.877029718 0.654670208
## 84 84 1.643680095 0.714352669
## 85 85 1.697700760 0.254774529
## 86 86 -3.464352259 0.039877432
## 87 87 -2.878301411 0.640515939
## 88 88 -3.069285634 -0.051902583
## 89 89 -2.593116090 -0.508825725
## 90 90 -2.833876559 0.298445376
## 91 91 -4.699853547 0.224252434
## 92 92 -3.684442808 0.081072083
## 93 93 -3.410029157 -0.123128446
## 94 94 -2.906292076 -0.534976375
## 95 95 -4.830120212 0.552470612
## 96 96 -5.166973412 0.475964284
## 97 97 -3.577090181 0.770597711
## 98 98 -0.518498478 -0.604661649
## 99 99 -0.591081066 -1.253121507
## 100 100 -0.455563018 -0.412444159
## 101 101 -0.382090350 -0.670285715
## 102 102 0.021286278 -0.228122070
## 103 103 2.672394963 -1.363387359
## 104 104 -3.844940149 -0.808306406
## 105 105 -4.098198574 -0.187564558
## 106 106 -3.730867830 -0.178672825
## 107 107 -3.826806544 0.057480418
## 108 108 -0.482384654 -0.036342388
## 109 109 1.513773658 -0.833993293
## 110 110 1.554223319 -1.119147355
## 111 111 1.186544422 0.018453054
## 112 112 1.252374999 1.058619975
## 113 113 1.321925547 -0.752895568
## 114 114 -0.026916178 0.598146860
## 115 115 1.430424700 0.498985060
## 116 116 -3.014648272 -0.020825818
## 117 117 -4.603719795 1.392065522
## 118 118 2.798135397 -0.712969550
## 119 119 1.475254579 0.388872258
## 120 120 0.725388539 0.677064321
## 121 121 0.421655318 0.147423058
## 122 122 -2.801193241 0.911100058
## 123 123 0.646402368 0.851029343
## 124 124 -0.473986718 0.679437419
## 125 125 -3.613834501 0.792692765
## 126 126 -0.179418784 -0.550698026
## 128 128 -0.382165620 -0.366255190
## 129 129 -0.777381220 -0.980468042
## 130 130 2.555799974 -0.450111562
## 131 131 1.453116853 0.044727050
## 132 132 2.932734769 -1.027800052
## 133 133 1.388927199 -0.219318149
## 134 134 -0.951593632 -1.098983872
## 135 135 -0.917024953 -1.337482289
## 136 136 -0.767627366 -0.770376906
## 137 137 -2.627752823 -0.304703722
## 138 138 -3.326979068 -0.755134233
## 139 139 -3.123504894 -0.285518785
## 140 140 -2.770021221 -1.181956705
## 141 141 -2.751266845 -0.850755177
## 142 142 1.791833048 0.254810013
## 143 143 1.923281377 0.521117937
## 144 144 1.446468616 0.759989577
## 145 145 2.625572500 0.398524475
## 146 146 2.630797456 -0.466642145
## 147 147 1.700821477 0.875978191
## 148 148 1.610772996 0.432670595
## 149 149 1.374209515 0.907306804
## 150 150 0.989662270 0.529986093
## 151 151 1.290403179 0.459127954
## 152 152 2.213040830 0.503721183
## 153 153 -0.479912386 -0.502846934
## 154 154 -0.848516821 -0.582648263
## 155 155 -0.086939444 -2.427904586
## 156 156 -0.106917517 -1.850279145
## 157 157 -3.912863849 0.354582424
## 158 158 -3.090111260 -0.464965153
## 159 159 -2.936216755 -0.560148488
## 160 160 -3.342089632 -0.395731403
## 161 161 -0.573276159 -2.341418330
## 162 162 -0.908003146 -1.601807842
## 163 163 -0.911952044 -1.725417426
## 164 164 -0.477402112 -1.680372591
## 165 165 -0.564168107 0.017764179
## 166 166 -1.502445105 0.204393953
## 167 167 -2.376488278 0.528209327
## 168 168 1.857697752 0.399258781
## 169 169 1.147347338 -0.280193114
## 170 170 -0.337630715 -0.339264940
## 171 171 1.386259213 -0.771283423
## 172 172 0.675026887 0.924748082
## 173 173 1.754586866 0.084480160
## 174 174 1.177172060 -0.134954802
## 175 175 -0.341649001 0.110052461
## 176 176 1.850620179 1.110864172
## 177 177 -0.320538565 -0.837395121
## 178 178 0.878490338 0.426119683
## 179 179 1.014702185 -0.344981359
## 180 180 0.588103349 0.478717459
## 181 181 0.582650738 1.062056155
## 182 182 2.684837234 0.077293185
## 183 183 1.438861920 0.468097197
## 184 184 1.575446207 -0.027721963
## 185 185 0.959793460 0.523434364
## 186 186 1.792341750 -0.256958874
## 187 187 1.558514100 0.582957163
## 188 188 -2.707076217 0.037872428
## 189 189 -2.938978604 0.028928821
## 190 190 -2.356162013 -0.316151188
## 191 191 -3.211903452 0.012585036
## 192 192 -0.430743131 -0.182580096
## 193 193 -0.749092979 0.063867101
## 194 194 0.336440727 -0.813331094
## 195 195 -0.014337505 -0.879299424
## 196 196 2.892874477 -1.647837183
## 197 197 2.441615571 -1.779035525
## 198 198 1.871782923 1.046152590
## 199 199 2.674255862 0.109648976
## 200 200 -0.501265010 -1.130291701
## 201 201 -0.073160632 -2.355725495
## 202 202 -0.952370553 -0.672739875
## 203 203 -0.470042317 -1.142605439
## 204 204 1.701478707 1.743735702
## 205 205 2.315769420 0.221629829
## 206 206 1.857007414 0.243855826
## 207 207 1.128026078 0.907134865
## 208 208 0.425780735 -0.033402083
## 209 209 -2.944578714 -0.045684570
## 210 210 1.170228577 -2.161315276
## 211 211 -0.217860373 -0.100911231
## 212 212 -0.855541319 -0.828012784
## 213 213 -3.578846145 0.368145191
## 214 214 -3.209025444 0.249088410
## 215 215 -2.425283619 -0.674258165
## 216 216 -2.766251223 -0.242219266
## 217 217 2.388538177 -0.320028832
## 218 218 1.674209524 0.831354760
## 219 219 2.933863628 -0.160592562
## 220 220 1.217748972 0.472528970
## 221 221 2.394069223 0.348551651
## 222 222 -2.646745030 0.334069926
## 223 223 -1.490181056 -1.949799285
## 224 224 -2.808757418 -0.217989155
## 225 225 -2.548832633 -0.722312905
## 226 226 -0.919030446 -0.726717908
## 227 227 -0.527743209 -0.764276645
## 228 228 -0.541749039 -1.154587394
## 229 229 -0.459987243 -1.590085188
## 230 230 -3.841125055 0.678068718
## 231 231 -3.482381244 0.592515104
## 232 232 -3.916241287 0.314272443
## 233 233 -2.982271463 -0.543778787
## 234 234 1.782110783 0.975351359
## 235 235 0.964052016 0.066578304
## 236 236 1.889537409 -0.438336379
## 237 237 1.021756242 0.173328325
## 238 238 2.232054251 0.110334239
## 239 239 2.031289692 0.638606859
## 240 240 2.135551521 0.329318624
## 241 241 1.689871780 1.067622974
## 242 242 0.054358036 0.318371133
## 243 243 0.422013526 1.181085747
## 244 244 0.861541889 1.043634307
## 245 245 3.588802232 -0.984655383
## 246 246 2.336457820 1.224742779
## 247 247 2.845830179 -0.598574468
## 248 248 2.822832697 -0.090878496
## 249 249 2.645425486 0.558983949
## 250 250 -1.360368233 -0.492287138
## 251 251 -2.400837720 0.180956510
## 252 252 -2.239625689 0.403322384
## 253 253 -0.406524048 -1.586289570
## 254 254 -0.009290678 -1.104987101
## 255 255 -0.069585789 -0.317031001
## 256 256 0.987984783 0.297675820
## 257 257 -0.416707626 -0.878391156
## 258 258 -0.278422430 -0.889222321
## 259 259 -0.616240367 -0.390069171
## 260 260 0.003149358 -0.624777836
## 261 261 -0.564103539 -1.444156730
## 262 262 -1.116910804 -0.215347758
## 263 263 -2.255381955 0.314159460
## 264 264 -1.728056366 0.532614334
## 265 265 -2.331453333 0.978079501
## 266 266 -2.619748295 -0.156626136
## 267 267 2.044534816 0.242983468
## 268 268 1.011189551 0.851560308
## 269 269 1.221434835 0.792129693
## 270 270 1.732317528 0.916029635
## 271 271 0.770311458 0.471821452
## 272 272 0.756210442 -0.134174520
## 273 273 1.073175757 -0.524757222
## 274 274 1.020741819 0.588534663
## 275 275 0.356352560 -0.064588514
## 276 276 -0.848880590 0.446506608
## 277 277 0.578852577 0.201808844
## 278 278 -0.883929366 -0.346822865
## 279 279 2.039769421 0.885586492
## 280 280 2.039684589 0.201679617
## 281 281 -0.656986285 -0.143775406
## 282 282 0.151261125 -1.114538034
## 283 283 0.959820515 -0.462176478
## 284 284 -0.159564418 -1.206210185
## 285 285 -0.552423721 -0.614374209
## 286 286 -2.176012464 -0.671211582
## 287 287 -2.274744101 0.032253505
## 288 288 -2.777790627 -0.026962326
## 289 289 -2.242263131 -0.557383827
## 290 290 -2.970031534 -0.626452843
## 291 291 -2.807756619 -0.430322899
## 292 292 -1.769430215 -0.375808971
## 293 293 -2.862610870 0.148009493
## 294 294 1.997897472 1.211229448
## 295 295 2.292400943 0.852340269
## 296 296 2.099880742 1.233896072
## 297 297 1.259306622 0.501253703
## 298 298 0.779857741 -1.665475402
## 299 299 -1.822400255 -1.218564076
## 300 300 2.039487370 -2.910621360
## 301 301 -0.249226179 -2.639510459
## 302 302 1.835030878 1.428692923
## 303 303 2.055567456 0.904896132
## 304 304 2.579537701 -0.528159909
## 305 305 2.258278300 1.074375746
## 306 306 1.176298783 0.221127996
## 307 307 -0.142698777 1.690873425
## 308 308 -0.128799729 1.075076900
## 309 309 1.198444999 1.325468994
## 310 310 2.357264399 1.224369480
## 311 311 2.925022134 -0.209329276
## 312 312 2.041233636 0.652838352
## 313 313 2.555562085 0.546494322
## 314 314 1.204707737 0.044106923
## 315 315 1.264180757 -0.585673901
## 316 316 1.231935249 -1.346232877
## 317 317 -0.191955250 -1.434308262
## 318 318 2.065014627 0.637450008
## 319 319 1.246249926 0.430639776
## 320 320 1.848644377 -0.162736573
## 321 321 1.742671441 0.935759793
## 322 322 1.842401967 0.715419525
## 323 323 3.148709193 0.325374582
## 324 324 0.725241727 0.738917445
## 325 325 2.994023553 -0.276153528
## 326 326 3.620670118 -1.045406241
## 327 327 3.659701860 -1.804958980
## 328 328 1.990247342 -1.039235491
## 329 329 1.858715374 -1.894775159
## 330 330 2.700541901 1.680246780
## 332 332 2.418082752 -0.123063331
## 333 333 2.170123465 0.712823892
## 334 334 -0.262803842 1.753327783
## 335 335 1.175264270 1.504449971
## 336 336 1.649996406 0.798058457
## 338 338 2.068505110 0.118955020
## 339 339 1.309844814 0.331868595
## 340 340 1.212006085 0.021128728
## 341 341 0.855659749 0.677551958
## 342 342 -0.285498451 1.038622426
## 343 343 1.209863362 1.361029734
## 344 344 2.947889961 0.510276862
## 345 345 2.731767978 0.647143584
## 346 346 2.625202956 0.648523415
## 347 347 2.360752401 -0.078459536
## 348 348 2.889298034 -0.414596300
## 349 349 2.681776080 0.244474403
## 350 350 2.316960497 0.597892021
## 351 351 2.111826953 0.859059081
## 352 352 2.327243030 0.502173802
## 353 353 2.403745234 -1.207831436
## 354 354 1.839453187 1.088127700
## 356 356 1.866515610 1.040881110
## 357 357 1.978886879 0.175370984
## 358 358 1.330047720 0.845428861
## 359 359 1.912817671 -0.447813248
## 360 360 1.498152123 -1.433356189
## 361 361 1.105123449 -1.264683344
## 362 362 -0.323466448 1.070364780
## 363 363 -0.224889250 0.674425260
## 364 364 -0.598343702 -0.323809860
## 365 365 -1.154892525 -1.650721111
## 366 366 -0.013967541 -0.756703765
## 367 367 -0.476982242 -0.851766829
## 368 368 1.757552139 -0.902532315
## 369 369 1.582898254 -0.622131999
## 370 370 2.001291157 -0.125894108
## 371 371 1.600164441 0.288825497
## 372 372 1.415877989 0.279476565
## 373 373 1.283448248 -0.491797564
## 374 374 0.919033123 -0.091959555
## 376 376 2.215536519 0.897052838
## 377 377 2.677737284 -0.036438338
## 378 378 2.335393352 -0.011991945
## 379 379 2.304924576 1.057601816
## 380 380 2.428525096 0.191088749
## 381 381 1.818264948 1.140154424
## 382 382 1.997199211 1.082357900
## 383 383 2.180816521 0.206987533
## 384 384 2.431911002 1.046208258
## 385 385 2.199275134 0.631504291
## 386 386 2.543696683 0.647834179
## 387 387 0.047963139 -0.226239158
## 388 388 0.679179940 -0.228057884
## 389 389 0.894187220 0.663258635
## 390 390 -0.468292737 0.222508428
## 391 391 1.102643525 1.048370273
## 392 392 1.534745601 1.521939669
## 393 393 1.100511344 -0.337723248
## 394 394 1.093382718 0.260354197
## 395 395 3.814625197 -1.998282305
## 396 396 1.220113039 1.874706592
## 397 397 1.709209706 -0.632856514
## 398 398 1.869512308 -0.815606787
#ggplot: plot each observation on the first two principal components;
#axis labels report the percent of total variance each PC explains
ggplot(data = pca.data, aes(x = X, y = Y, label = ".")) +
  geom_text() +
  xlab(paste0("PC1 - ", pc1.var.per[1], "%")) +
  ylab(paste0("PC2 - ", pc1.var.per[2], "%")) +
  theme_bw() +
  ggtitle("PCA Graph")
#PC1 accounts for 79.8% of variation in the original data
#PC2 accounts for 12.1% of variation in the original data
#Eigenvalues: the variance of each component is the squared standard
#deviation reported by prcomp()
eigen_auto <- (auto_pca$sdev)^2
eigen_auto
## [1] 4.7882662 0.7286311 0.2584673 0.1251770 0.0631764 0.0362820
#Log-eigenvalue (scree-type) diagram
plot(log(eigen_auto),
     xlab = "Component number",
     ylab = "log(Component variance)",
     type = "l",
     main = "Log(eigenvalue) diagram")
#Load helper packages for correlation and PCA visualizations
library(corrplot)
## corrplot 0.92 loaded
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Scree plot with percentage-of-variance labels on each bar
fviz_eig(auto_pca, addlabels = TRUE)
#Variable correlation circle, colored by quality of representation (cos2)
fviz_pca_var(auto_pca,
             col.var = "cos2",
             gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
             repel = TRUE)
# Load libraries for factor analysis
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#Keep only the six numeric columns (mpg through acceleration) for factoring
data <- auto[, 1:6]
#Question 1: choose the number of factors via parallel analysis scree plots
fa.parallel(data)
## Parallel analysis suggests that the number of factors = 2 and the number of components = 1
#Output: parallel analysis suggests the number of factors = 2 and the number of components = 1, so two factors are ideal for the autompg dataset
#2.Explain the output for your factor model?
#Principal-components extraction with two components and varimax rotation
fit.pc <- principal(data, nfactors = 2, rotate = "varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = data, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## mpg -0.88 -0.20 0.81 0.194 1.1
## cylinders 0.90 0.30 0.90 0.096 1.2
## displacement 0.91 0.35 0.95 0.049 1.3
## horsepower 0.79 0.55 0.92 0.077 1.8
## weight 0.95 0.20 0.95 0.055 1.1
## acceleration -0.23 -0.97 0.99 0.011 1.1
##
## RC1 RC2
## SS loadings 3.99 1.53
## Proportion Var 0.66 0.25
## Cumulative Var 0.66 0.92
## Proportion Explained 0.72 0.28
## Cumulative Proportion 0.72 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.03
## with the empirical chi square 11.49 with prob < 0.022
##
## Fit based upon off diagonal values = 1
#High absolute values (close to 1) indicate a strong relationship between the variable and the factor. #Example: For displacement variable, among the two factors (RC1,RC2), RC1 is having high absolute value so we can conclude that displacement variable is explained better by RC1. Like this we can analyze the relationship between the variables and the factors.
#h2 explains how much variance of the variables are explained by the factors. #Example: For displacement variable, 0.950 of it’s variance is explained by the factors
#u2 indicates the amount of variance not explained by the factors #Example: For displacement variable, 0.049 of it’s variance is unique and not explained by the factors.
#Principal Components Analysis Call: principal(r = data, nfactors = 2, rotate = “varimax”) Standardized loadings (pattern matrix) based upon correlation matrix
#RC1 RC2
#SS loadings 3.99 1.53; Proportion Var 0.66 0.25; Cumulative Var 0.66 0.92; Proportion Explained 0.72 0.28; Cumulative Proportion 0.72 1.00
#Mean item complexity = 1.3. Test of the hypothesis that 2 components are sufficient.
#The root mean square of the residuals (RMSR) is 0.03 with the empirical chi square 11.49 with prob < 0.022
#Fit based upon off diagonal values = 1
# Eigenvalues of the correlation matrix (variance carried by each component)
round(fit.pc$values, 3)
## [1] 4.788 0.729 0.258 0.125 0.063 0.036
# Rotated loadings: correlation of each variable with each rotated component
fit.pc$loadings
##
## Loadings:
## RC1 RC2
## mpg -0.875 -0.200
## cylinders 0.901 0.305
## displacement 0.911 0.348
## horsepower 0.788 0.549
## weight 0.952 0.199
## acceleration -0.234 -0.966
##
## RC1 RC2
## SS loadings 3.989 1.528
## Proportion Var 0.665 0.255
## Cumulative Var 0.665 0.919
# Communalities: share of each variable's variance explained by the two components
fit.pc$communality
## mpg cylinders displacement horsepower weight acceleration
## 0.8058710 0.9039134 0.9505252 0.9225211 0.9454138 0.9886527
# Rotated factor scores for each observation; columns are ordered RC1, RC2
fit.pc$scores
## RC1 RC2
## 1 0.65435709 1.0709019786
## 2 0.95687760 1.3651675618
## 3 0.57601151 1.5829765405
## 4 0.74236010 1.1806527547
## 5 0.49564776 1.6731947793
## 6 1.27550768 1.9512212960
## 7 1.30198023 2.4308345234
## 8 1.17976421 2.5910589447
## 9 1.46814560 2.0857362965
## 10 0.79270945 2.5705321468
## 11 0.80680969 1.9660488827
## 12 0.49035663 2.5958928486
## 13 0.81127916 1.9566624228
## 14 0.95602572 2.4217371617
## 15 -0.74623264 0.3730521011
## 16 0.05752349 -0.0679901333
## 17 0.19094803 -0.1300452894
## 18 0.04390855 -0.2780509789
## 19 -1.06596500 0.6379853067
## 20 -0.49213646 -1.7638490721
## 21 -0.38240732 -0.6363974539
## 22 -0.81200380 0.5063508122
## 23 -0.49064815 -0.5059429592
## 24 -1.12469867 1.4765982484
## 25 -0.04802161 0.1021149669
## 26 1.94936543 0.4567632859
## 27 1.82662767 0.0654405668
## 28 1.65316294 0.6938965191
## 29 2.41358922 -1.3575252990
## 30 -1.06596500 0.6379853067
## 31 -0.82452891 0.2695687432
## 32 -0.96340945 0.7929023622
## 34 -0.12979644 0.8434229066
## 35 0.59791425 -0.2934533650
## 36 0.56359699 -0.2831046735
## 37 0.45028798 -0.3130239661
## 38 0.46972821 -0.2461263539
## 39 1.25390086 1.0325439757
## 40 1.42648901 1.2058691529
## 41 1.39393120 0.4225645142
## 42 1.22712448 0.6047937827
## 43 1.66196493 1.0755148866
## 44 1.62214064 0.8978686494
## 45 1.78470658 0.8330960982
## 46 0.17821828 0.6202110264
## 47 -0.15174388 -1.2887331224
## 48 0.40870826 -0.0456018566
## 49 0.30032692 0.0664593233
## 50 -0.89288829 0.6863418583
## 51 -1.12034263 0.8521699077
## 52 -0.65849993 -1.2046233891
## 53 -1.25152217 0.6422302023
## 54 -0.90336671 -0.9582138742
## 55 -1.22661205 -0.4395992090
## 56 -0.68380131 -1.1038407665
## 57 -0.40288976 -1.6289284813
## 58 -0.71975398 0.2160657244
## 59 -0.69872319 -0.3636839674
## 60 0.17342187 -2.9655848868
## 61 0.02791885 -1.3916981782
## 62 -0.50395931 -0.2631694150
## 63 1.31600234 0.9936658936
## 64 1.45870464 1.0451187636
## 65 1.26722963 0.4370005208
## 66 1.32193926 0.6094120581
## 67 0.73409749 1.3238210359
## 68 1.68476301 1.4947217252
## 69 1.56674529 0.3265393021
## 70 1.59826817 0.3496104290
## 71 1.60815229 0.9349791946
## 72 -1.00503166 0.8760561604
## 73 1.01758289 0.8623324167
## 74 1.31746870 0.0867135161
## 75 1.65452892 -0.6154301275
## 76 1.34471830 0.2484131532
## 77 -0.31207942 0.3915405125
## 78 -0.27093197 -0.9218028575
## 79 0.15644374 -1.5282908120
## 80 -0.61654568 -0.7927404177
## 81 -0.53899443 -0.1024357412
## 82 -0.72112533 -0.2546894680
## 83 -0.69949784 0.5094687003
## 84 -1.04449730 0.4166648548
## 85 -0.82771764 -0.0767834257
## 86 1.39806408 0.7443710922
## 87 0.84582135 1.2561189543
## 88 1.28397834 0.5679026290
## 89 1.32649065 -0.0083620948
## 90 1.00544671 0.8879833357
## 91 1.80820829 1.1884758629
## 92 1.46678585 0.8322500270
## 93 1.46055219 0.5622216150
## 94 1.46834293 0.0276894952
## 95 1.69093957 1.5594754794
## 96 1.86866512 1.5474618186
## 97 1.06438323 1.5343881203
## 98 0.52666863 -0.5296699540
## 99 0.89348420 -1.1957453381
## 100 0.40097323 -0.3406301201
## 101 0.50491741 -0.6262272013
## 102 0.10986750 -0.2438132889
## 103 -0.38572272 -1.9732806921
## 104 1.99484326 -0.0689293690
## 105 1.77588158 0.6341203217
## 106 1.62082245 0.5689676766
## 107 1.53735331 0.8363506990
## 108 0.21644813 0.0596639906
## 109 -0.18641587 -1.1825431145
## 110 -0.05474952 -1.4901181332
## 111 -0.49553085 -0.2212356479
## 112 -1.06320298 0.8574475194
## 113 -0.15000342 -1.0584984678
## 114 -0.29991247 0.6334301207
## 115 -0.84520558 0.2338026227
## 116 1.24544742 0.5894495372
## 117 1.16177124 2.3950250562
## 118 -0.77532641 -1.3159291365
## 119 -0.80632511 0.1091086554
## 120 -0.64903576 0.5637294116
## 121 -0.24932025 0.0692702158
## 122 0.67358419 1.5245590235
## 123 -0.70712025 0.7623856898
## 124 -0.15907626 0.8094315088
## 125 1.06794582 1.5650359460
## 126 0.35974965 -0.5417745273
## 128 0.34690350 -0.3070216557
## 129 0.82804778 -0.8717185021
## 130 -0.81272210 -0.9908233101
## 131 -0.61836120 -0.2477074208
## 132 -0.66679145 -1.6737521247
## 133 -0.45481383 -0.5119021539
## 134 0.96100317 -0.9608168511
## 135 1.07082494 -1.2182172993
## 136 0.71484113 -0.6531295699
## 137 1.23456670 0.2129616411
## 138 1.75507648 -0.1181386251
## 139 1.42762446 0.3336323548
## 140 1.74885539 -0.6791841245
## 141 1.56900563 -0.3352710556
## 142 -0.86628710 -0.0958344236
## 143 -1.05855588 0.1570970690
## 144 -0.98745466 0.5045681054
## 145 -1.28244478 -0.1140206111
## 146 -0.83484351 -1.0233862254
## 147 -1.15191711 0.5747622544
## 148 -0.88459320 0.1276103144
## 149 -1.03444179 0.6738837371
## 150 -0.68081070 0.3557277525
## 151 -0.76714207 0.2203518599
## 152 -1.16818081 0.0800751176
## 153 0.45793952 -0.4306030220
## 154 0.65038135 -0.4396374304
## 155 1.29770721 -2.5313365839
## 156 1.00562099 -1.9208580506
## 157 1.41815422 1.1657177863
## 158 1.50723033 0.1384667224
## 159 1.49368358 0.0073303957
## 160 1.57443586 0.2622491071
## 161 1.45192364 -2.3419179879
## 162 1.20453497 -1.4975522689
## 163 1.27040839 -1.6265245882
## 164 1.06902672 -1.6673522262
## 165 0.22181548 0.1330525945
## 166 0.50906211 0.5192530874
## 167 0.69868894 1.0364542055
## 168 -0.96835045 0.0424608186
## 169 -0.32423227 -0.5268246660
## 170 0.31463425 -0.2877164395
## 171 -0.16679209 -1.0908488431
## 172 -0.75716450 0.8339757261
## 173 -0.76249040 -0.2671045466
## 174 -0.41194621 -0.3803922827
## 175 0.08271044 0.1848199351
## 176 -1.33536699 0.7909839147
## 177 0.56657822 -0.8141506604
## 178 -0.58128820 0.2692258620
## 179 -0.23622968 -0.5679454888
## 180 -0.48970487 0.3833313019
## 181 -0.79070980 0.9968624790
## 182 -1.13972987 -0.4632871041
## 183 -0.83260448 0.1996637012
## 184 -0.63079876 -0.3485750153
## 185 -0.66517241 0.3549061680
## 186 -0.60046170 -0.6332246025
## 187 -0.94131479 0.2959876814
## 188 1.08897090 0.5887048219
## 189 1.18859356 0.6263406910
## 190 1.12928994 0.1458698828
## 191 1.30886344 0.6645259717
## 192 0.27131783 -0.1043374383
## 193 0.27358400 0.2189536029
## 194 0.28500914 -0.9221097416
## 195 0.46295958 -0.9202362193
## 196 -0.32815202 -2.3166230136
## 197 -0.07514202 -2.3628564578
## 198 -1.31039486 0.7187542389
## 199 -1.15221594 -0.4271722662
## 200 0.79285002 -1.0850039174
## 201 1.25454324 -2.4583525374
## 202 0.73974617 -0.5131616680
## 203 0.78646413 -1.1042630239
## 204 -1.60327420 1.4856551522
## 205 -1.06361231 -0.2369134691
## 206 -0.88728433 -0.1205509466
## 207 -0.93353025 0.7236245798
## 208 -0.15701112 -0.1214078971
## 209 1.22967345 0.5491424783
## 210 0.64426400 -2.5063843853
## 211 0.14167958 -0.0617649560
## 212 0.78080644 -0.6958120329
## 213 1.27430998 1.1122243402
## 214 1.18474281 0.9122384179
## 215 1.34375344 -0.2160766873
## 216 1.25880604 0.3066465982
## 217 -0.81184270 -0.8203367786
## 218 -1.11782175 0.5333101842
## 219 -1.11805577 -0.7635323109
## 220 -0.74435348 0.2491539875
## 221 -1.16165728 -0.1195407145
## 222 0.91028994 0.8874374764
## 223 1.62385759 -1.7448407807
## 224 1.26361848 0.3407043392
## 225 1.41933219 -0.2414741543
## 226 0.75415154 -0.5765919319
## 227 0.61342767 -0.6953691138
## 228 0.82205956 -1.1023016602
## 229 1.01496036 -1.5760942639
## 230 1.22061580 1.4907866803
## 231 1.11816906 1.3282206629
## 232 1.44049183 1.1240827249
## 233 1.50403537 0.0338553579
## 234 -1.23686573 0.6626064913
## 235 -0.42942816 -0.1255934851
## 236 -0.54598150 -0.8433555093
## 237 -0.50855245 -0.0252219565
## 238 -0.97147255 -0.3367826911
## 239 -1.16386417 0.2585422478
## 240 -1.04578579 -0.0873102395
## 241 -1.24705581 0.7781833390
## 242 -0.18776127 0.3232231670
## 243 -0.78679767 1.1544013938
## 244 -0.89535084 0.9209682062
## 245 -0.95790582 -1.7614942509
## 246 -1.59353505 0.8120221009
## 247 -0.85432557 -1.2055016180
## 248 -1.10882366 -0.6678271417
## 249 -1.37398732 0.0504139807
## 250 0.81303281 -0.2409768550
## 251 0.88917408 0.6768240619
## 252 0.70755833 0.8775869229
## 253 0.99109192 -1.5829507135
## 254 0.57821232 -1.1582006757
## 255 0.19330101 -0.3187283110
## 256 -0.55936144 0.1121740440
## 257 0.62727442 -0.8376895943
## 258 0.57627137 -0.8771024577
## 259 0.45514593 -0.2845571229
## 260 0.32348958 -0.6565695244
## 261 0.98174198 -1.4017766111
## 262 0.56936516 0.0004028746
## 263 0.76036088 0.7871731754
## 264 0.43083975 0.9095891017
## 265 0.44638848 1.4996238472
## 266 1.15431308 0.3667995958
## 267 -0.96363097 -0.1594938213
## 268 -0.85679150 0.6889711731
## 269 -0.91200160 0.5839434161
## 270 -1.18563604 0.6104239630
## 271 -0.56074176 0.3391430764
## 272 -0.23995086 -0.2942100209
## 273 -0.16672379 -0.7685427833
## 274 -0.72397445 0.4108933619
## 275 -0.11236575 -0.1400706419
## 276 0.11554292 0.6409075612
## 277 -0.34197040 0.0944911815
## 278 0.54229470 -0.1848721646
## 279 -1.29572482 0.5161174032
## 280 -0.94017355 -0.2018736559
## 281 0.34380172 -0.0177200024
## 282 0.51742469 -1.2007847190
## 283 -0.15283165 -0.6798552956
## 284 0.69237453 -1.2339983280
## 285 0.54561130 -0.5329874080
## 286 1.24008302 -0.2634256810
## 287 0.91483415 0.4951369301
## 288 1.15163450 0.5349767851
## 289 1.20804409 -0.1304877907
## 290 1.54199922 -0.0554230689
## 291 1.37358644 0.1175801363
## 292 0.92001133 -0.0357407310
## 293 1.09541594 0.7358731022
## 294 -1.44785611 0.8664886100
## 295 -1.38190530 0.4299845309
## 296 -1.50140525 0.8696051687
## 297 -0.77630506 0.2708839037
## 298 0.54638347 -1.9066607987
## 299 1.37979561 -0.9097764859
## 300 0.67778017 -3.4693221124
## 301 1.47416971 -2.7205849920
## 302 -1.49420002 1.1278216554
## 303 -1.31223253 0.5331863285
## 304 -0.78187166 -1.0775769153
## 305 -1.48335178 0.6700107058
## 306 -0.59669172 -0.0063771805
## 307 -0.82052877 1.8041214183
## 308 -0.50611039 1.1548012023
## 309 -1.17983304 1.1485383154
## 310 -1.60186214 0.8074110070
## 311 -1.08909993 -0.8129062765
## 312 -1.17533460 0.2714669170
## 313 -1.33069208 0.0555241900
## 314 -0.51630516 -0.1979857876
## 315 -0.21328171 -0.8712291370
## 316 0.19528687 -1.6631727935
## 317 0.82421252 -1.4669017230
## 318 -1.17707454 0.2504889332
## 319 -0.73425045 0.1993966336
## 320 -0.67249967 -0.5457212412
## 321 -1.20013275 0.6290383405
## 322 -1.12643660 0.3774878606
## 323 -1.45866496 -0.2969001996
## 324 -0.68112886 0.6286964651
## 325 -1.08262145 -0.8970547466
## 326 -0.93937679 -1.8317365119
## 327 -0.56052223 -2.6370774388
## 328 -0.27485988 -1.4946390984
## 329 0.22374454 -2.3661660119
## 330 -1.97942779 1.2164093426
## 332 -0.92633134 -0.6195411443
## 333 -1.25930264 0.3083070673
## 334 -0.80380660 1.8940449546
## 335 -1.26337959 1.3411443668
## 336 -1.09059701 0.5032636040
## 338 -0.90897386 -0.2945673763
## 339 -0.70895075 0.0828046191
## 340 -0.50734934 -0.2235897030
## 341 -0.70264063 0.5378248207
## 342 -0.42298568 1.1483045566
## 343 -1.20299493 1.1835567782
## 344 -1.47253937 -0.0620558567
## 345 -1.45517627 0.1254608577
## 346 -1.41225083 0.1485188664
## 347 -0.92603869 -0.5610877750
## 348 -0.96776513 -1.0211642783
## 349 -1.22538231 -0.2871490490
## 350 -1.25969315 0.1578686125
## 351 -1.31144553 0.4736553577
## 352 -1.21414690 0.0552925292
## 353 -0.35656261 -1.7554912479
## 354 -1.31897451 0.7693781979
## 356 -1.30549740 0.7142880209
## 357 -0.90159835 -0.2171654787
## 358 -0.98418961 0.6178755761
## 359 -0.55058934 -0.8580257178
## 360 0.13154966 -1.8086240607
## 361 0.20482944 -1.5518419476
## 362 -0.42393692 1.1893288613
## 363 -0.25848636 0.7536571647
## 364 0.41337282 -0.2186230550
## 365 1.33107291 -1.4988399738
## 366 0.39907894 -0.7916026414
## 367 0.63811916 -0.7975151543
## 368 -0.25062435 -1.3039333480
## 369 -0.32485745 -0.9741350259
## 370 -0.75416665 -0.5379956011
## 371 -0.80547331 -0.0212560620
## 372 -0.72514062 0.0062986581
## 373 -0.26997243 -0.7765789646
## 374 -0.32857807 -0.2829074751
## 376 -1.37366920 0.4925132402
## 377 -1.07770089 -0.5812498427
## 378 -0.95020509 -0.4861635987
## 379 -1.49373571 0.6429413992
## 380 -1.09391414 -0.2918421298
## 381 -1.33734220 0.8282956856
## 382 -1.38057862 0.7313327831
## 383 -1.00073206 -0.2249199661
## 384 -1.53981910 0.6052292779
## 385 -1.22896892 0.2170212495
## 386 -1.37851240 0.1643231979
## 387 0.09796344 -0.2472460540
## 388 -0.15960017 -0.3771543228
## 389 -0.71098908 0.5150061511
## 390 0.07611804 0.3285642107
## 391 -0.99655371 0.8770494580
## 392 -1.41969356 1.2866101161
## 393 -0.27514503 -0.5777259478
## 394 -0.58312523 0.0516187547
## 395 -0.52347405 -2.8714559004
## 396 -1.47421815 1.7207684678
## 397 -0.37101220 -1.0110078724
## 398 -0.34166326 -1.2353768952
fa.plot(fit.pc)    # Correlations within factors
#3.Show the columns that go into each factor?
fa.diagram(fit.pc) # Diagram of the variable-to-factor assignments
#From the above we can infer that weight, displacement, cylinders and mpg are well explained by RC1, while acceleration is explained better by RC2.
#4.Perform some visualizations using the factors?
#Very Simple Structure criterion across candidate numbers of factors
vss(data)
##
## Very Simple Structure
## Call: vss(x = data)
## VSS complexity 1 achieves a maximimum of 0.97 with 1 factors
## VSS complexity 2 achieves a maximimum of 0.99 with 2 factors
##
## The Velicer MAP achieves a minimum of 0.14 with 1 factors
## BIC achieves a minimum of 97.18 with 2 factors
## Sample Size adjusted BIC achieves a minimum of 109.87 with 2 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex
## 1 0.97 0.00 0.14 9 3.7e+02 8.4e-74 0.71 0.97 0.32 314 343 1.0
## 2 0.86 0.99 0.16 4 1.2e+02 3.2e-25 0.12 0.99 0.27 97 110 1.3
## 3 0.64 0.99 0.29 0 9.8e-01 NA 0.22 0.99 NA NA NA 1.5
## 4 0.83 0.99 0.49 -3 3.2e-07 NA 0.19 0.99 NA NA NA 1.5
## 5 0.83 0.99 1.00 -5 3.5e-08 NA 0.19 0.99 NA NA NA 1.5
## 6 0.83 0.99 NA -6 3.5e-08 NA 0.19 0.99 NA NA NA 1.5
## eChisq SRMR eCRMS eBIC
## 1 3.7e+01 5.6e-02 0.072 -17
## 2 2.0e+00 1.3e-02 0.025 -22
## 3 8.8e-03 8.7e-04 NA NA
## 4 2.8e-09 4.8e-07 NA NA
## 5 3.3e-10 1.7e-07 NA NA
## 6 3.3e-10 1.7e-07 NA NA
#output Very Simple Structure Call: vss(x = data) VSS complexity 1 achieves a maximimum of 0.97 with 1 factors VSS complexity 2 achieves a maximimum of 0.99 with 2 factors
#The Velicer MAP achieves a minimum of 0.14 with 1 factors; BIC achieves a minimum of 97.18 with 2 factors; Sample Size adjusted BIC achieves a minimum of 109.87 with 2 factors
#Statistics by number of factors
# Computing Correlation Matrix
corrm.auto <- cor(data)
corrm.auto
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## acceleration
## mpg 0.4233285
## cylinders -0.5046834
## displacement -0.5438005
## horsepower -0.6891955
## weight -0.4168392
## acceleration 1.0000000
# Visualize the correlation matrix. Base plot() on a matrix only plots the
# first two columns against each other; corrplot (attached above and
# otherwise unused) is the intended correlation-matrix visualization.
corrplot(corrm.auto)
# PCA on the standardized variables (scale = TRUE centers and scales)
auto_pca <- prcomp(data, scale = TRUE)
summary(auto_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.188 0.8536 0.50840 0.35380 0.25135 0.19048
## Proportion of Variance 0.798 0.1214 0.04308 0.02086 0.01053 0.00605
## Cumulative Proportion 0.798 0.9195 0.96256 0.98342 0.99395 1.00000
# Variance-per-component bar plot
plot(auto_pca)
#Biplot Visualization of the rotated two-component solution
biplot(fit.pc)
### Clustering
# Load clustering and visualization libraries
library(magrittr)
library(NbClust)
library(cluster)
library(factoextra)
#Hierarchical clustering: pick features, standardize them, and build the
#Euclidean distance matrix used by hclust()
auto_featured <- auto[, c("mpg", "horsepower", "weight", "acceleration")]
auto_clust <- auto_featured
auto_mpg_scaled <- scale(auto_clust)
dist_matrix <- dist(auto_mpg_scaled)
#Default linkage (complete) dendrogram
hc <- hclust(dist_matrix)
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")
#Average-linkage clustering on the same distance matrix
hc <- hclust(dist_matrix, method = "average")
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")
#The dendrograms above suggest k = 2 clusters; the D index plot below confirms this.
num_clusters <- 2
clusters <- cutree(hc, k = num_clusters)
# Membership count for each cluster
table(clusters)
## clusters
## 1 2
## 100 292
# Visualize cluster membership projected onto the first two principal components
pca_result <- prcomp(auto_clust, scale = TRUE)
fviz_cluster(list(data = pca_result$x[, 1:2], cluster = clusters))
#Non-Hierarchical Clustering(k-means)
# NOTE(review): kmeans() is run on the UNSCALED features here, while the
# hierarchical clustering above used scale()d data; weight (in pounds) will
# dominate the Euclidean distance. Consider kmeans(auto_mpg_scaled, ...) for
# comparability — confirm intent before changing (the recorded sizes below
# were produced with unscaled data).
# NOTE(review): no set.seed() before kmeans(), so cluster labels and sizes
# may differ between runs.
num_clusters <- 2
kmeans_model <- kmeans(auto_clust, centers = num_clusters)
# Membership for each cluster
table(kmeans_model$cluster)
##
## 1 2
## 236 156
# Visualize cluster centers for k-means
# (frame.type is deprecated in factoextra; ellipse.type is the modern argument)
fviz_cluster(kmeans_model, data = auto_clust, geom = "point", frame.type = "convex",
pointsize = 2, fill = "white", main = "K-means Cluster Centers")
## Warning: argument frame is deprecated; please use ellipse instead.
## Warning: argument frame.type is deprecated; please use ellipse.type instead.
# Visualize cluster and membership using first two Principal Components for k-means
pca_result <- prcomp(auto_clust, scale = TRUE)
fviz_cluster(kmeans_model, data = pca_result$x[, 1:2], geom = "point",
pointsize = 2, fill = "white", main = "K-means Clustering Result (PCA)")
library(cluster)
library(factoextra)
# Silhouette widths for the k-means partition (distances on the same
# unscaled features the model was fit on)
sil <- silhouette(kmeans_model$cluster, dist(auto_clust))
# Silhouette plot: per-cluster average width gauges cluster cohesion
fviz_silhouette(sil, main = "Silhouette Plot for K-means Clustering")
## cluster size ave.sil.width
## 1 1 236 0.69
## 2 2 156 0.57
# Attach cluster membership to the feature data
data_clustered <- cbind(auto_clust, Cluster = kmeans_model$cluster)
# Scatter plot of mpg vs horsepower, points colored by cluster membership
plot(data_clustered$mpg, data_clustered$horsepower,
     col = data_clustered$Cluster, pch = 16,
     xlab = "mpg", ylab = "horsepower",
     main = "Scatter Plot of Clustering")
#Ward (ward.D2) hierarchical clustering on standardized features, as a pipeline
res.hc <- auto_featured %>%
  scale() %>%
  dist(method = "euclidean") %>%
  hclust(method = "ward.D2")
#Dendrogram of the Ward solution
fviz_dend(res.hc, cex = 0.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Search k = 2..10 with all NbClust indices to pick the best cluster count
res.nbclust <- auto_featured %>%
  scale() %>%
  NbClust(distance = "euclidean", min.nc = 2, max.nc = 10,
          method = "complete", index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 11 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
#The D index is a graphical method of determining the number of clusters.
###1.Model Development
# Multiple linear regression of mpg on the five candidate predictors
fit <- lm(mpg ~ cylinders + displacement + horsepower + weight + acceleration,
          data = auto)
# Show coefficient estimates, significance, and fit statistics
summary(fit)
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5816 -2.8618 -0.3404 2.2438 16.3416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.626e+01 2.669e+00 17.331 <2e-16 ***
## cylinders -3.979e-01 4.105e-01 -0.969 0.3330
## displacement -8.313e-05 9.072e-03 -0.009 0.9927
## horsepower -4.526e-02 1.666e-02 -2.716 0.0069 **
## weight -5.187e-03 8.167e-04 -6.351 6e-10 ***
## acceleration -2.910e-02 1.258e-01 -0.231 0.8171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.247 on 386 degrees of freedom
## Multiple R-squared: 0.7077, Adjusted R-squared: 0.7039
## F-statistic: 186.9 on 5 and 386 DF, p-value: < 2.2e-16
###In this step, we loaded the auto-mpg dataset and fitted a multiple regression model using the lm() function. The model predicts the "mpg" (miles per gallon) based on several predictor variables: cylinders, displacement, horsepower, weight, acceleration.
###2.Model Acceptance ###From the summary Pr column it is evident that weight and horsepower are the two most significant variables. R-squared value explains the fact that cylinders,displacement,horsepower,weight, acceleration variables contribute to the 70% of total variance,which is not bad. p-value< 2.2e-16 is significant so the model is acceptable.
# Point estimates of the regression coefficients
coefficients(fit)
## (Intercept) cylinders displacement horsepower weight
## 4.626431e+01 -3.979284e-01 -8.313012e-05 -4.525708e-02 -5.186917e-03
## acceleration
## -2.910471e-02
###The above gives the dependent variable mpg in equation form y = b0 + b1*x1 + b2*x2 + ... + bn*xn, where intercept b0 = 4.626431e+01 and coefficients b1 = -3.979284e-01, etc.
# 95% confidence intervals for each coefficient
confint(fit,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 41.015911401 51.512704304
## cylinders -1.205111478 0.409254760
## displacement -0.017919421 0.017753161
## horsepower -0.078016480 -0.012497680
## weight -0.006792548 -0.003581286
## acceleration -0.276367486 0.218158058
# Fitted (predicted) mpg for every observation in the training data
fitted(fit)
## 1 2 3 4 5 6 7 8
## 18.647725 16.094377 18.123484 18.111104 18.524507 11.276861 10.240802 10.700654
## 9 10 11 12 13 14 15 16
## 9.617057 14.232593 16.583305 16.859061 16.474576 16.562340 27.623839 24.415195
## 17 18 19 20 21 22 23 24
## 24.630626 26.129029 29.211755 32.468065 26.357308 27.564335 27.536265 27.597103
## 25 26 27 28 29 30 31 32
## 25.615530 8.975592 10.869423 10.428473 9.238063 29.211755 28.393515 28.399860
## 34 35 36 37 38 39 40 41
## 25.291042 20.817108 21.611876 22.295008 21.826036 13.403376 11.638537 14.188001
## 42 43 44 45 46 47 48 49
## 14.641909 8.866888 10.387559 8.117628 23.120448 28.359360 21.870214 23.169580
## 50 51 52 53 54 55 56 57
## 28.847921 29.170523 30.172823 30.092738 31.975588 32.653488 31.883310 30.759964
## 58 59 60 61 62 63 64 65
## 28.096857 29.521756 29.845376 27.530180 28.744038 13.066226 12.033751 14.425067
## 66 67 68 69 70 71 72 73
## 14.332226 16.885983 9.280606 12.292522 12.304835 11.148426 28.196336 15.715756
## 74 75 76 77 78 79 80 81
## 15.508486 13.981486 14.711356 23.958496 27.674763 24.705884 29.663828 27.882001
## 82 83 84 85 86 87 88 89
## 28.138432 26.852249 29.382821 29.309154 13.487074 16.885983 15.425721 15.468017
## 90 91 92 93 94 95 96 97
## 16.311088 8.063997 12.755411 12.892202 13.866896 8.433826 6.859634 14.991602
## 98 99 100 101 102 103 104 105
## 22.437443 21.803647 23.590596 23.723427 24.032372 31.857017 9.932575 9.678871
## 106 107 108 109 110 111 112 113
## 10.838976 11.205761 24.428860 28.307934 28.381116 27.589547 29.581640 28.295384
## 114 115 116 117 118 119 120 121
## 25.791819 28.391820 14.938151 10.172373 32.197828 29.624180 26.744637 24.266541
## 122 123 124 125 126 128 129 130
## 18.315400 25.479591 23.389815 15.580494 22.990810 23.818821 21.531911 30.966324
## 131 132 133 134 135 136 137 138
## 27.848524 31.590603 27.586751 19.223733 19.514243 19.885480 14.833294 11.467880
## 139 140 141 142 143 144 145 146
## 12.754879 12.197186 13.735217 28.918113 31.000761 28.782550 33.279454 30.962628
## 147 148 149 150 151 152 153 154
## 29.826614 29.885687 29.211388 26.925874 27.601665 30.794293 22.162837 20.696739
## 155 156 157 158 159 160 161 162
## 22.184746 23.649618 10.806691 13.052130 12.513111 11.805267 18.002771 18.352108
## 163 164 165 166 167 168 169 170
## 18.976820 19.373139 22.679643 20.980848 20.431015 29.543777 26.721564 23.751391
## 171 172 173 174 175 176 177 178
## 27.147977 25.908811 29.441115 26.577280 23.572806 31.042592 22.634343 25.953486
## 179 180 181 182 183 184 185 186
## 24.847501 24.529852 25.210802 32.446555 27.539903 28.990302 26.722893 28.877486
## 187 188 189 190 191 192 193 194
## 29.040966 14.478317 14.154339 16.669638 13.937229 22.114809 21.290210 24.059050
## 195 196 197 198 199 200 201 202
## 23.270431 31.110659 30.081320 31.036771 32.449466 19.879737 21.176661 19.499866
## 203 204 205 206 207 208 209 210
## 22.475977 31.630076 30.680787 29.615126 27.702179 23.249832 15.445247 23.081383
## 211 212 213 214 215 216 217 218
## 23.327214 18.131849 11.834646 15.107302 16.662414 16.381543 30.441283 29.434244
## 219 220 221 222 223 224 225 226
## 32.033645 27.936740 30.920019 16.004201 16.469114 14.619597 14.460885 20.142410
## 227 228 229 230 231 232 233 234
## 20.848480 19.988662 20.583888 12.689501 13.422777 11.660289 13.401092 30.649841
## 235 236 237 238 239 240 241 242
## 25.999590 28.992176 25.883264 30.680104 29.682492 30.858958 29.364753 24.451473
## 243 244 245 246 247 248 249 250
## 25.825731 25.584265 31.570991 31.921921 31.452080 30.219266 32.135836 20.175888
## 251 252 253 254 255 256 257 258
## 16.961136 17.875206 20.210978 22.666259 24.174195 26.121705 21.040598 22.633709
## 259 260 261 262 263 264 265 266
## 21.113906 23.603375 19.558856 20.297571 18.343876 18.131183 19.814999 15.157097
## 267 268 269 270 271 272 273 274
## 29.928932 26.670237 27.915016 29.280741 26.886185 25.183496 25.492298 27.364569
## 275 276 277 278 279 280 281 282
## 24.460556 21.523308 24.503593 19.696753 30.696317 30.029759 21.373211 23.974671
## 283 284 285 286 287 288 289 290
## 25.184630 22.319323 20.968574 16.806131 17.506342 15.907785 16.636455 12.988318
## 291 292 293 294 295 296 297 298
## 15.181237 18.266142 15.447576 31.059661 31.037182 30.691827 26.756329 21.879835
## 299 300 301 302 303 304 305 306
## 16.659251 24.179557 20.600748 29.700470 29.910338 30.687435 30.066318 26.272160
## 307 308 309 310 311 312 313 314
## 24.868858 24.277664 26.944962 29.676319 31.194749 30.049064 30.774032 26.216112
## 315 316 317 318 319 320 321 322
## 25.265085 24.425587 21.703671 29.325649 26.075462 27.573861 27.437523 29.078576
## 323 324 325 326 327 328 329 330
## 30.258365 24.965156 30.220612 31.046478 29.691539 25.351793 24.136269 31.635363
## 332 333 334 335 336 338 339 340
## 29.982484 31.844092 22.463114 27.622847 27.273055 29.032369 27.487409 26.713603
## 341 342 343 344 345 346 347 348
## 26.487143 24.383008 28.113528 32.446207 31.566205 32.352876 30.403258 30.915025
## 349 350 351 352 353 354 356 357
## 30.722565 30.825842 29.889988 30.643995 28.775407 29.542206 29.387223 28.591120
## 358 359 360 361 362 363 364 365
## 26.142455 27.113451 23.692827 23.464035 23.204171 22.834438 20.706079 18.425535
## 366 367 368 369 370 371 372 373
## 23.507831 21.555375 26.598289 26.445851 27.734109 26.988623 27.297135 25.876801
## 374 376 377 378 379 380 381 382
## 25.159469 30.599443 30.554335 30.857078 30.362631 29.970741 29.054236 29.410247
## 383 384 385 386 387 388 389 390
## 29.359122 31.003942 30.983569 30.813409 23.130623 23.874770 26.665775 23.655909
## 391 392 393 394 395 396 397 398
## 26.088254 28.188422 24.781987 25.843314 30.547053 28.618187 26.930304 26.278575
###3.Residual Analysis
plot(fit, which=1) # Residuals vs Fitted
plot(fit, which=2) # Normal Q-Q plot
# NOTE(review): this assignment masks the base function stats::residuals()
# for the remainder of the script; a name such as resid_vals would be safer
# (left unchanged in case later code references this variable).
residuals <- residuals(fit)
#Plot residuals against fitted values to check for homoscedasticity
plot_resid_fitted <- ggplot() +
geom_point(aes(x = fitted(fit), y = residuals)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Fitted Values", y = "Residuals",
title = "Residuals vs Fitted Values Plot") +
theme_minimal()
print(plot_resid_fitted)
#The residual vs. fitted plot is a tool used to evaluate the assumptions and adequacy of a regression model. It helps to identify whether the model adequately captures the underlying relationships in the data or if there are issues that need to be addressed. #Ideally the residuals scatter randomly around zero; here the points show a systematic pattern around zero rather than a random scatter, which indicates the linear model is not fully adequate.
###4.Prediction
# Predict mpg for one hypothetical heavy, high-power 8-cylinder car
new_car <- data.frame(
  cylinders = 8, displacement = 310, horsepower = 170,
  weight = 4300, acceleration = 9
)
predict(fit, new_car)
## 1
## 12.79572
##Here when random values are given to the independent variables cylinders,displacement, horsepower, weight and acceleration, the model predicted the dependent variable mpg would be 12.79.
###5.Model Accuracy
# Generate fitted-model predictions for every row of the dataset
predicted <- predict(fit, newdata = auto)
# RMSE: square root of the mean squared difference between the observed
# mpg values and the model's predictions
prediction_error <- auto$mpg - predicted
rmse <- sqrt(mean(prediction_error^2))
rmse
## [1] 4.214427
#RMSE measures the average deviation of predicted from actual values.
#Conclusion: Here the moderate RMSE value 4.21 suggests that the model is having resonable accuracy but with noticeable deviations from actual values.
###Some Visualizations
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
#Nonlinearity
# Component + residual (partial-residual) plot for each predictor
crPlots(fit)
# Spread-level plot of studentized residuals vs. fitted values.
# (Removed a duplicate library(car) call here; car is already attached above.)
spreadLevelPlot(fit)
##
## Suggested power transformation: 0.2920186
#Load required Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
###1.Model Development
# Create binary outcome variable: high mpg (1) vs. low mpg (0),
# split at the median mpg (base-R equivalent of the dplyr mutate)
autompg <- auto
autompg$new_mpg <- ifelse(autompg$mpg > median(autompg$mpg), 1, 0)
# Fit a logistic regression of the binary outcome on the five predictors
logistic_simple <- glm(
  new_mpg ~ cylinders + displacement + horsepower + weight + acceleration,
  data = autompg, family = binomial
)
#Considering the dependent variable mpg in binary format new_mpg and building the model with the independent variables namely cylinders,displacement,horsepower,weight and acceleration.
###2.Model Acceptance
# Coefficient table (log-odds scale), null/residual deviance and AIC
# for the fitted logistic regression model
summary(logistic_simple)
##
## Call:
## glm(formula = new_mpg ~ cylinders + displacement + horsepower +
## weight + acceleration, family = binomial, data = autompg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.2653088 2.7984397 4.740 2.13e-06 ***
## cylinders -0.0126533 0.3474005 -0.036 0.9709
## displacement -0.0141728 0.0084399 -1.679 0.0931 .
## horsepower -0.0524776 0.0207838 -2.525 0.0116 *
## weight -0.0015631 0.0008848 -1.767 0.0773 .
## acceleration -0.0857152 0.1258312 -0.681 0.4958
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 543.43 on 391 degrees of freedom
## Residual deviance: 206.80 on 386 degrees of freedom
## AIC: 218.8
##
## Number of Fisher Scoring iterations: 7
#The estimate column gives the coefficients that represent the log odds of the outcome variable (new_mpg) for one-unit increase in the predictor variable, holding other variables constant.
#For example: The coefficient for horsepower(-0.05) indicates the change in the outcome variable(new_mpg) for one unit increase in horsepower, holding other variables constant. And from the above it is evident that horsepower is the only one variable which is statistically significant. Other variables may not have a significant effect on mpg.
#The null deviance (543.43) represents the deviance when only the intercept is included in the model.
#The residual deviance (206.80) represents the deviance after fitting the logistic regression model with the predictor variables. Here the model is moderately acceptable.Generally,Smaller residual deviance indicates good fit.
###3.Residual Analysis
# Residuals of the logistic model; for glm objects residuals() returns
# deviance residuals by default (type = "deviance")
residuals(logistic_simple)
## 1 2 3 4 5 6
## -0.147549814 -0.038372615 -0.089048019 -0.094419660 -0.130982045 -0.005927762
## 7 8 9 10 11 12
## -0.002880412 -0.003828951 -0.002273463 -0.015088602 -0.031445293 -0.058254154
## 13 14 15 16 17 18
## -0.041229162 -0.006474167 0.597025822 -1.029181548 -1.018462839 -1.386380500
## 19 20 21 22 23 24
## 0.369585795 0.128003520 0.659966825 0.519687336 0.622457087 0.789385187
## 25 26 27 28 29 30
## -1.272630200 -0.004207932 -0.010486186 -0.007919461 -0.008387601 0.369585795
## 31 32 34 35 36 37
## 0.595473578 0.516974527 -0.943731579 -0.455432589 -0.473037268 -0.645710151
## 38 39 40 41 42 43
## -0.549386691 -0.025096571 -0.011338615 -0.033419832 -0.048828725 -0.007642675
## 44 45 46 47 48 49
## -0.010151081 -0.006543392 -0.497511288 -2.087826117 -0.499710740 -0.751042769
## 50 51 52 53 54 55
## 0.436647956 0.431109484 0.242942500 0.242915781 0.156427477 0.147975657
## 56 57 58 59 60 61
## 0.172887602 0.251435164 0.569127070 0.334668034 0.247739136 -1.653134938
## 62 63 64 65 66 67
## -2.095976451 -0.023853641 -0.011805056 -0.046360053 -0.034816994 -0.080053627
## 68 69 70 71 72 73
## -0.003477056 -0.024333111 -0.022122805 -0.007573151 -2.207771779 -0.064597367
## 74 75 76 77 78 79
## -0.085286250 -0.053554804 -0.047480884 -1.091580743 -2.081060697 -1.449613021
## 80 81 82 83 84 85
## 0.273329223 -2.001207467 0.508980270 0.705592486 0.317996038 0.392464215
## 86 87 88 89 90 91
## -0.020139933 -0.080053627 -0.048287063 -0.075212309 -0.063998135 -0.003448105
## 92 93 94 95 96 97
## -0.021385528 -0.025434212 -0.041014225 -0.002471548 -0.001443892 -0.025421193
## 98 99 100 101 102 103
## -0.552492122 -0.443768556 -0.687712784 -0.755311831 1.401560978 0.143001924
## 104 105 106 107 108 109
## -0.012941762 -0.009486014 -0.013874771 -0.013211958 -0.794327186 -2.075675772
## 110 111 112 113 114 115
## -2.074248960 -1.896844149 -2.482725079 -1.988682389 -1.313072665 0.451162871
## 116 117 118 119 120 121
## -0.044868503 -0.003374278 0.110823007 0.322083183 -1.883353302 -1.098123282
## 122 123 124 125 126 128
## -0.091655305 1.025137573 -0.802210007 -0.027057082 -0.838202707 -0.709023402
## 129 130 131 132 133 134
## -0.442724371 0.199967358 0.494024473 0.178860137 0.536919192 -0.316485503
## 135 136 137 138 139 140
## -0.248790158 -0.383763444 -0.065747315 -0.022787555 -0.036049007 -0.040935392
## 141 142 143 144 145 146
## -0.042718064 0.380877723 0.174082052 0.325934017 0.094143016 0.183330333
## 147 148 149 150 151 152
## 0.251392756 0.258814484 0.323495911 0.710692231 0.569341260 0.182997881
## 153 154 155 156 157 158
## -0.646736255 -0.370882729 -0.695227949 -0.883264242 -0.011022836 -0.032498585
## 159 160 161 162 163 164
## -0.033448268 -0.025720066 -0.214102481 -0.239020500 -0.221138732 -0.391536169
## 165 166 167 168 169 170
## -0.529674950 -0.394595917 -0.203406271 0.291237938 0.696565772 -0.702675236
## 171 172 173 174 175 176
## 0.634725303 0.827092327 0.265980109 0.790697213 -1.080707770 0.186980209
## 177 178 179 180 181 182
## -0.694866051 0.757204872 0.855560938 -1.400522137 1.121659717 0.125643724
## 183 184 185 186 187 188
## 0.502665867 0.416498652 0.756419570 0.371655191 0.365219691 -0.063409829
## 189 190 191 192 193 194
## -0.045371857 -0.126362137 -0.033707168 -0.600701853 -0.427229872 1.225594576
## 195 196 197 198 199 200
## -0.740899984 0.172766191 0.256215017 0.188575636 0.125108499 -0.402705476
## 201 202 203 204 205 206
## -0.544536642 -0.280940529 -0.511126609 0.171207313 0.213626226 0.292563322
## 207 208 209 210 211 212
## 0.441970645 -1.073215972 -0.054685366 -1.138308888 -0.936261217 -0.328643646
## 213 214 215 216 217 218
## -0.014750348 -0.047830412 -0.101130912 -0.061057010 0.246958109 0.342580990
## 219 220 221 222 223 224
## 0.141129892 0.627829741 0.204580263 -0.073810474 -0.166782513 -0.052199916
## 225 226 227 228 229 230
## -0.072904431 -0.306547744 -0.417336469 -0.409100386 -0.372144669 -0.012241884
## 231 232 233 234 235 236
## -0.023374629 -0.008275317 -0.030870582 0.247397896 0.860744615 0.343008668
## 237 238 239 240 241 242
## 0.826281742 0.204553836 0.333127452 0.208759707 0.294718484 -1.345429300
## 243 244 245 246 247 248
## -1.425220751 -1.580992972 0.150541290 0.163017575 0.140404012 0.243138960
## 249 250 251 252 253 254
## 0.144495577 -0.330191657 -0.083405130 -0.110915016 -0.349305743 -0.750761785
## 255 256 257 258 259 260
## -1.137437817 0.777780805 -0.484510252 -0.690069708 -0.451441028 -1.038434932
## 261 262 263 264 265 266
## -0.306676803 -0.257612256 -0.102154885 -0.100946376 -0.157753476 -0.062366196
## 267 268 269 270 271 272
## 0.247020707 0.754328970 0.611309395 0.302421578 -1.679372276 1.275934096
## 273 274 275 276 277 278
## 0.919325367 0.663566755 -1.243322073 -0.564916091 -1.073322051 -0.346134426
## 279 280 281 282 283 284
## 0.206272630 0.244275333 -0.394957053 -1.037021947 -1.445916542 -0.638908176
## 285 286 287 288 289 290
## -0.407446845 -0.099633478 -0.124449006 -0.058605659 -0.081043675 -0.025605628
## 291 292 293 294 295 296
## -0.046596030 -0.181257100 -0.040964365 0.188800443 0.172988154 0.256255858
## 297 298 299 300 301 302
## 0.542381994 1.473294847 3.448595258 1.021559128 2.268079648 0.246006499
## 303 304 305 306 307 308
## 0.254325379 0.210801734 0.219431292 0.859139967 1.317011269 1.451074985
## 309 310 311 312 313 314
## 0.717770803 0.279086214 0.179792584 0.242742330 0.188358137 0.879176742
## 315 316 317 318 319 320
## 0.944968155 1.205290173 -0.603612899 0.315980250 0.782631183 0.479616442
## 321 322 323 324 325 326
## 0.603760963 0.326531144 0.215376168 1.227697326 0.225975697 0.164092732
## 327 328 329 330 332 333
## 0.216809981 0.591621560 0.905140157 0.161405742 0.252714439 0.148330116
## 334 335 336 338 339 340
## 1.908467119 0.473169063 0.588562917 0.330008196 0.591306470 0.744363290
## 341 342 343 344 345 346
## 0.845034251 1.340067421 0.489950283 0.124296802 0.164125428 0.128869410
## 347 348 349 350 351 352
## 0.235618019 0.205327193 0.189358312 0.201968364 0.223099345 0.207305008
## 353 354 356 357 358 359
## 0.324105599 0.282292806 0.300728040 0.372225160 0.817587412 0.517447026
## 360 361 362 363 364 365
## 1.085283316 0.963488012 1.571995307 1.593592505 -0.387867568 3.087515700
## 366 367 368 369 370 371
## -0.969033377 -0.694510242 0.707536191 0.697883106 0.573253456 0.565277063
## 372 373 374 376 377 378
## 0.613456909 0.960768964 0.966596067 0.251644295 0.228655942 0.213652424
## 379 380 381 382 383 384
## 0.206356799 0.262899340 0.442025102 0.300839178 0.303798256 0.185707665
## 385 386 387 388 389 390
## 0.191312675 0.200008186 1.691659674 1.690268265 0.828746484 -0.587950700
## 391 392 393 394 395 396
## 0.869177867 0.486526954 1.070204156 0.785055961 0.223975195 0.434738062
## 397 398
## 0.587146082 0.691085043
# Residuals-vs-fitted diagnostic for the logistic model
plot(logistic_simple, which = 1)
#A systematic pattern in the residuals vs fitted plot suggests the chosen independent variables do not fully explain the dependent variable.
###4.Prediction
# Predicted probabilities of the "high mpg" class on the training data
predicted_values <- predict(logistic_simple, type = "response")
# Threshold the probabilities at 0.5 to obtain 0/1 class labels
# (multiplying the logical vector by 1 keeps the observation names)
predicted_class <- (predicted_values > 0.5) * 1
predicted_class
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 34 35 36 37 38 39 40 41
## 1 1 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0
## 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
## 0 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
## 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
## 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1
## 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
## 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
## 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 1 1 1 0
## 122 123 124 125 126 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
## 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1
## 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
## 0 0 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1
## 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
## 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0
## 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
## 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0
## 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
## 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
## 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
## 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0
## 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
## 0 0 0 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 0
## 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
## 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1
## 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
## 1 1 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1
## 323 324 325 326 327 328 329 330 332 333 334 335 336 338 339 340 341 342 343 344
## 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1
## 345 346 347 348 349 350 351 352 353 354 356 357 358 359 360 361 362 363 364 365
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
## 366 367 368 369 370 371 372 373 374 376 377 378 379 380 381 382 383 384 385 386
## 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 387 388 389 390 391 392 393 394 395 396 397 398
## 0 0 1 0 1 1 1 1 1 1 1 1
###5.Accuracy
# Accuracy = share of observations whose predicted class matches the data
original <- autompg$new_mpg # observed binary response (0 or 1)
accuracy <- mean(original == predicted_class)
print(accuracy)
## [1] 0.8979592
#Accuracy provides an overall measure of model performance. An accuracy of 0.898 indicates that the logistic regression model correctly predicted the outcome (the value of new_mpg) for approximately 89.8% of the observations.
###Visualizations
# Install pROC package (only need to run once)
#install.packages("pROC")
# Load the pROC package
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
autompg$new_mpg
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0
## [112] 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
## [149] 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1
## [186] 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0
## [260] 0 0 0 0 0 1 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
# Sanity check: observed and predicted vectors must have the same length
length(autompg$new_mpg)
## [1] 392
length(predicted_class)
## [1] 392
# Both vectors have length 392 (checked above), so the original
# intersect(seq_along(original), seq_along(predicted_class)) index was an
# identity no-op; it is removed and the vectors are used directly.
# NOTE(review): building a ROC curve from the thresholded 0/1 classes yields
# only a single operating point; consider roc(original, predicted_values)
# on the fitted probabilities for a full curve.
# Compute ROC curve for the logistic regression model
roc_curve <- roc(original, predicted_class)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curve
plot(roc_curve, legacy.axes = TRUE, main = "ROC Curve for Logistic Regression Model")
roc(original, predicted_class, plot = TRUE, legacy.axes = TRUE,
    xlab = "False Positive Percentage", ylab = "True Positive Percentage",
    col = "#377eb8", lwd = 4, percent = TRUE, print.auc = TRUE,
    partial.auc = c(100, 90), auc.polygon = TRUE,
    auc.polygon.col = "#377eb822", print.auc.x = 45)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = original[common], predictor = predicted_class[common], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE, partial.auc = c(100, 90), auc.polygon = TRUE, auc.polygon.col = "#377eb822", print.auc.x = 45)
##
## Data: predicted_class[common] in 196 controls (original[common] 0) < 196 cases (original[common] 1).
## Partial area under the curve (specificity 100%-90%): 3.891%
#ROC curves are typically used to evaluate binary classification models across decision thresholds. Here the outcome (new_mpg) is binary, so a ROC curve is applicable in principle; however, the curve above is built from the already-thresholded 0/1 predictions rather than the predicted probabilities, so it contains only a single operating point and understates the model's discrimination. Passing the predicted probabilities (predicted_values) to roc() would give a meaningful ROC curve.
# Creating a categorical column for the predictor variable autompg based on a threshold.
auto$mpg_class <- ifelse(auto$mpg > 20, "High MPG", "Low MPG")
# Splitting the dataset into 75% training and 25% test sets.
# Fix the RNG seed so the random split (and all downstream results) are
# reproducible; without it every run produces a different partition.
set.seed(42)
smp_size_raw <- floor(0.75 * nrow(auto))
train_ind_raw <- sample(nrow(auto), size = smp_size_raw)
train_raw.df <- auto[train_ind_raw, ]
test_raw.df <- auto[-train_ind_raw, ]
###1.Model Development
#install.packages("MASS")
# (Removed a redundant library(MASS) call here; MASS is already attached above.)
# Fitting the LDA model on the training set: classify mpg_class from the
# five engine/vehicle predictors
lda_model <- lda(mpg_class ~ cylinders + displacement + horsepower + weight + acceleration, data = train_raw.df)
lda_model
## Call:
## lda(mpg_class ~ cylinders + displacement + horsepower + weight +
## acceleration, data = train_raw.df)
##
## Prior probabilities of groups:
## High MPG Low MPG
## 0.585034 0.414966
##
## Group means:
## cylinders displacement horsepower weight acceleration
## High MPG 4.343023 124.9331 81.86628 2411.599 16.50872
## Low MPG 7.081967 294.3525 137.88525 3768.025 14.21148
##
## Coefficients of linear discriminants:
## LD1
## cylinders 0.448897513
## displacement 0.002228421
## horsepower -0.003676862
## weight 0.000970897
## acceleration -0.021467555
#Prior Probabilities denote proportion of each class before modelling in our case high-mpg is more prevalent in the training set. #Faster acceleration is seen in high-mpg vehicles and heavy weight vehicles have low-mpg. #vehicles with more cylinders, larger displacement, higher horsepower, and heavier weight tend to be in the ‘Low MPG’ class. Vehicles with quicker acceleration tend to be in the ‘High MPG’ class #Cylinders is the most significant coefficient.The coefficients suggest which variables most strongly contribute to classifying ‘Low MPG’ and ‘High MPG’.
###2.Model Acceptance
# summary() on an lda object lists the components of the fit (prior, means,
# scaling, ...) rather than significance tests
summary(lda_model)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 10 -none- numeric
## scaling 5 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
#with significant coefficients and no obvious issues with class imbalance, the LDA model is likely acceptable. Here only cylinders variable plays a vital role. LDA model is moderately acceptable.
# Distribution of the first (and only) discriminant score for each class
plot(lda_model,dimen = 1, type = "b")
# Default lda plot of the discriminant scores
plot(lda_model)
###3.Residual Analysis
# Residual Analysis
# NOTE(review): stats::residuals() has no method for "lda" objects, so this
# call returns NULL (see output below); LDA is a classifier and does not
# define residuals in the regression sense.
residuals(lda_model)
## NULL
#The NULL result is expected: residuals() has no method for lda objects because LDA is a classifier and residuals in the regression sense are not defined for it. It does not indicate overfitting or data inconsistency; classification error (see the confusion matrix below) is the appropriate diagnostic instead.
###4.Prediction
# predict() on an lda fit returns a list: $class (predicted labels),
# $posterior (per-class posterior probabilities) and $x (discriminant scores)
prediction <- predict(lda_model, test_raw.df)
prediction
## $class
## [1] Low MPG Low MPG High MPG High MPG High MPG Low MPG Low MPG High MPG
## [9] Low MPG Low MPG High MPG High MPG High MPG Low MPG Low MPG Low MPG
## [17] Low MPG Low MPG High MPG High MPG High MPG High MPG Low MPG Low MPG
## [25] Low MPG Low MPG Low MPG High MPG High MPG High MPG High MPG Low MPG
## [33] Low MPG High MPG High MPG High MPG Low MPG Low MPG Low MPG High MPG
## [41] High MPG High MPG High MPG High MPG High MPG Low MPG High MPG High MPG
## [49] High MPG Low MPG Low MPG High MPG Low MPG Low MPG High MPG High MPG
## [57] Low MPG Low MPG Low MPG High MPG High MPG High MPG Low MPG Low MPG
## [65] Low MPG High MPG High MPG Low MPG Low MPG Low MPG Low MPG High MPG
## [73] High MPG Low MPG Low MPG Low MPG High MPG High MPG High MPG High MPG
## [81] High MPG High MPG High MPG High MPG High MPG High MPG High MPG High MPG
## [89] High MPG High MPG High MPG High MPG High MPG High MPG High MPG High MPG
## [97] High MPG High MPG
## Levels: High MPG Low MPG
##
## $posterior
## High MPG Low MPG
## 1 0.0146833423 0.985316658
## 13 0.0044283727 0.995571627
## 17 0.6852000603 0.314799940
## 18 0.7652426253 0.234757375
## 21 0.9837332996 0.016266700
## 27 0.0034750859 0.996524914
## 28 0.0032366609 0.996763339
## 30 0.9958089254 0.004191075
## 42 0.0036777936 0.996322206
## 48 0.2888853048 0.711114695
## 50 0.9934684803 0.006531520
## 53 0.9962365168 0.003763483
## 58 0.9939697445 0.006030256
## 63 0.0020576602 0.997942340
## 65 0.0034126801 0.996587320
## 70 0.0013120040 0.998687996
## 73 0.0067126906 0.993287309
## 74 0.0033904686 0.996609531
## 77 0.9679939252 0.032006075
## 78 0.9876875096 0.012312490
## 80 0.9951929097 0.004807090
## 82 0.9947030050 0.005296995
## 89 0.0044910542 0.995508946
## 90 0.0083726789 0.991627321
## 91 0.0002782045 0.999721795
## 94 0.0027552606 0.997244739
## 104 0.0002098663 0.999790134
## 109 0.9952156110 0.004784389
## 115 0.9944116767 0.005588323
## 118 0.9980852907 0.001914709
## 119 0.9945427087 0.005457291
## 126 0.4854362800 0.514563720
## 138 0.0006550645 0.999344935
## 147 0.9954809460 0.004519054
## 149 0.9924586227 0.007541377
## 151 0.9919323530 0.008067647
## 154 0.2199284147 0.780071585
## 156 0.3577349698 0.642265030
## 157 0.0005365084 0.999463492
## 170 0.5642299803 0.435770020
## 172 0.9765137182 0.023486282
## 174 0.9885698419 0.011430158
## 179 0.9628631272 0.037136873
## 182 0.9980339696 0.001966030
## 185 0.9834116280 0.016588372
## 193 0.2553279770 0.744672023
## 197 0.9960875735 0.003912426
## 198 0.9970744507 0.002925549
## 199 0.9980222782 0.001977722
## 200 0.1711269923 0.828873008
## 203 0.3553631843 0.644636816
## 207 0.9785511272 0.021448873
## 212 0.1770987627 0.822901237
## 213 0.0018140837 0.998185916
## 220 0.9933103519 0.006689648
## 221 0.9975153688 0.002484631
## 227 0.2681367750 0.731863225
## 228 0.1792792464 0.820720754
## 230 0.0019301631 0.998069837
## 238 0.9962071234 0.003792877
## 239 0.9964755521 0.003524448
## 244 0.9954899085 0.004510092
## 251 0.0088215802 0.991178420
## 253 0.2381215253 0.761878475
## 254 0.4720263105 0.527973690
## 255 0.5384612737 0.461538726
## 256 0.9752455351 0.024754465
## 259 0.2791786793 0.720821321
## 260 0.4813843739 0.518615626
## 262 0.2526354926 0.747364507
## 265 0.0346225635 0.965377437
## 272 0.9772027007 0.022797299
## 280 0.9953778014 0.004622199
## 287 0.0090472894 0.990952711
## 288 0.0039242697 0.996075730
## 290 0.0017530236 0.998246976
## 298 0.5392310707 0.460768929
## 302 0.9931239145 0.006876085
## 303 0.9945579320 0.005442068
## 304 0.9972277972 0.002772203
## 305 0.9951607930 0.004839207
## 309 0.9808503650 0.019149635
## 314 0.9782125558 0.021787444
## 319 0.9772717540 0.022728246
## 325 0.9964721243 0.003527876
## 327 0.9939553625 0.006044637
## 342 0.7368686838 0.263131316
## 345 0.9977447542 0.002255246
## 353 0.9928041844 0.007195816
## 361 0.5258697620 0.474130238
## 366 0.5016648734 0.498335127
## 372 0.9855625494 0.014437451
## 379 0.9944709880 0.005529012
## 380 0.9957689102 0.004231090
## 387 0.6490980497 0.350901950
## 389 0.9806394186 0.019360581
## 392 0.9885880518 0.011411948
## 396 0.9898487521 0.010151248
##
## $x
## LD1
## 1 1.87939879
## 13 2.31629410
## 17 0.07837952
## 18 -0.06756123
## 21 -1.12294295
## 27 2.40423794
## 28 2.43000866
## 30 -1.61741279
## 42 2.38367764
## 48 0.68494785
## 50 -1.45623404
## 53 -1.65645453
## 58 -1.48527089
## 63 2.59412140
## 65 2.41080885
## 70 2.75700763
## 73 2.16515054
## 74 2.41317650
## 77 -0.87254509
## 78 -1.22503303
## 80 -1.56763435
## 82 -1.53238740
## 89 2.31119235
## 90 2.08469528
## 91 3.31783213
## 94 2.48837279
## 104 3.41971880
## 109 -1.56935314
## 115 -1.51293455
## 118 -1.90132305
## 119 -1.52155602
## 126 0.38049148
## 138 3.00823667
## 147 -1.59006701
## 149 -1.40391575
## 151 -1.37934791
## 154 0.81694475
## 156 0.57090477
## 157 3.08042532
## 170 0.26607862
## 172 -0.98755548
## 174 -1.25222593
## 179 -0.81689618
## 182 -1.89174629
## 185 -1.11574869
## 193 0.74623100
## 197 -1.64237514
## 198 -1.74777107
## 199 -1.88959954
## 200 0.92953592
## 203 0.57464054
## 207 -1.02109990
## 212 0.91452783
## 213 2.63973679
## 220 -1.44753223
## 221 -1.80696164
## 227 0.72227346
## 228 0.90914708
## 230 2.61728186
## 238 -1.65363256
## 239 -1.68025397
## 244 -1.59078764
## 251 2.06565894
## 253 0.77969675
## 254 0.39991102
## 255 0.30373135
## 256 -0.96808226
## 259 0.70219740
## 260 0.38635473
## 262 0.75136603
## 265 1.56204076
## 272 -0.99856959
## 280 -1.58187450
## 287 2.05644721
## 288 2.36014789
## 290 2.65213125
## 298 0.30261190
## 302 -1.43753129
## 303 -1.52257098
## 304 -1.76728219
## 305 -1.56521644
## 309 -1.06292184
## 314 -1.01531534
## 319 -0.99969134
## 325 -1.67990146
## 327 -1.48440486
## 342 -0.01267668
## 345 -1.84204781
## 353 -1.42099113
## 361 0.32200817
## 366 0.35702815
## 372 -1.16672237
## 379 -1.51681186
## 380 -1.61396449
## 387 0.13717132
## 389 -1.05888527
## 392 -1.25280874
## 396 -1.29557143
#This prediction given us an idea about how the model classifies the test data.
###5.Model Accuracy
# Classify the held-out test observations with the fitted LDA model
predicted_classes <- predict(lda_model, test_raw.df)$class
# Cross-tabulate predictions against true labels to expose misclassifications
confusion_matrix <- table(predicted_classes, test_raw.df$mpg_class)
confusion_matrix
##
## predicted_classes High MPG Low MPG
## High MPG 55 4
## Low MPG 5 34
# Accuracy = correctly classified test rows / total test rows
correct <- predicted_classes == test_raw.df$mpg_class
accuracy <- sum(correct) / nrow(test_raw.df)
accuracy
## [1] 0.9081633
#This model gives 90% accuracy for the autompg dataset with high-mpg and low-mpg labels which is a good sign.
###Visualizations
# Structure of the training set; the "na.action" attribute in the output
# below shows 6 rows were dropped as incomplete cases
str(train_raw.df)
## 'data.frame': 294 obs. of 10 variables:
## $ mpg : num 21 13 21.5 18 34.4 36.4 19 19 15.5 30 ...
## $ cylinders : int 6 8 6 8 4 5 4 6 8 4 ...
## $ displacement: num 155 302 231 318 98 121 121 156 304 98 ...
## $ horsepower : int 107 130 115 150 65 67 112 108 120 68 ...
## $ weight : int 2472 3870 3245 3436 2045 2950 2868 2930 3962 2155 ...
## $ acceleration: num 14 15 15.4 11 16.2 19.9 15.5 15.5 13.9 16.5 ...
## $ model.year : int 73 76 79 70 81 80 73 76 76 78 ...
## $ origin : int 1 1 1 1 1 2 2 3 1 1 ...
## $ car.name : chr "mercury capri v6" "ford f108" "pontiac lemans v6" "plymouth satellite" ...
## $ mpg_class : chr "High MPG" "Low MPG" "High MPG" "Low MPG" ...
## - attr(*, "na.action")= 'omit' Named int [1:6] 33 127 331 337 355 375
## ..- attr(*, "names")= chr [1:6] "33" "127" "331" "337" ...
#install.packages("klaR")
library(klaR)
#summarizing the key learnings and takeaways from the analysis:
#Trends in Fuel Efficiency: Higher cylinder count, greater horsepower, and heavier vehicles generally result in lower MPG.
#Predictive Capability: The linear regression model provides a reasonable level of accuracy in predicting MPG based on vehicle characteristics.
#Future Improvements: Consider adding polynomial terms or applying advanced models like Random Forests to capture non-linear relationships.
#Impact on Environmental Policies: Insights from this dataset can help inform fuel economy standards and influence automotive design towards more efficient vehicles.